BDE 4.14.0 Production release
Loading...
Searching...
No Matches
bdlb_hashutil.h
Go to the documentation of this file.
1/// @file bdlb_hashutil.h
2///
3/// The content of this file has been pre-processed for Doxygen.
4///
5
6
7// bdlb_hashutil.h -*-C++-*-
8#ifndef INCLUDED_BDLB_HASHUTIL
9#define INCLUDED_BDLB_HASHUTIL
10
11#include <bsls_ident.h>
12BSLS_IDENT("$Id: $")
13
14/// @defgroup bdlb_hashutil bdlb_hashutil
15/// @brief Provide a utility of hash functions.
16/// @addtogroup bdl
17/// @{
18/// @addtogroup bdlb
19/// @{
20/// @addtogroup bdlb_hashutil
21/// @{
22///
23/// <h1> Outline </h1>
24/// * <a href="#bdlb_hashutil-purpose"> Purpose</a>
25/// * <a href="#bdlb_hashutil-classes"> Classes </a>
26/// * <a href="#bdlb_hashutil-description"> Description </a>
27/// * <a href="#bdlb_hashutil-hashing-fundamental-types"> Hashing Fundamental Types </a>
28/// * <a href="#bdlb_hashutil-hashing-user-defined-types"> Hashing User-defined Types </a>
29/// * <a href="#bdlb_hashutil-definitions-and-formal-statement-of-no-funneling-property"> Definitions and Formal Statement of No-Funneling Property </a>
30/// * <a href="#bdlb_hashutil-usage"> Usage </a>
31/// * <a href="#bdlb_hashutil-examples-introduction"> Examples Introduction </a>
32/// * <a href="#bdlb_hashutil-example-1-using-chaining"> Example 1: Using Chaining </a>
33/// * <a href="#bdlb_hashutil-example-2-using-double-hashing"> Example 2: Using Double-Hashing </a>
34/// * <a href="#bdlb_hashutil-example-3-creating-a-generator-of-sequential-integers"> Example 3: Creating a Generator of Sequential Integers </a>
35/// * <a href="#bdlb_hashutil-example-4-creating-a-character-string-generator"> Example 4: Creating a Character String Generator </a>
36/// * <a href="#bdlb_hashutil-example-5-creating-a-multiple-field-keys-generator"> Example 5: Creating a Multiple-Field Keys Generator </a>
37/// * <a href="#bdlb_hashutil-example-6-experimental-results"> Example 6: Experimental Results </a>
38///
39/// # Purpose {#bdlb_hashutil-purpose}
40/// Provide a utility of hash functions.
41///
42/// # Classes {#bdlb_hashutil-classes}
43///
44/// - bdlb::HashUtil: utility for hash functions
45///
46/// @see bdlc_hashtable2, bdeimp_inthash, bdeimp_strhash, bdlde_crc32
47///
48/// # Description {#bdlb_hashutil-description}
49/// This component defines a utility `struct`, `bdlb::HashUtil`,
50/// that provides two hash functions, `hash1` and `hash2`. These hash functions
51/// have a good funneling property, which can be formalized as documented below
52/// (see {Definitions and Formal Statement of No-Funneling Property}). In
53/// short, these hashes have the property that even if two inputs differ in only
54/// a few bits, they have a negligible probability of producing collisions. In
55/// addition, these two hash functions produce full 32-bit results and allow
56/// hash table sizes to be a power of two. They are fast and reliable. They
57/// should work equally well on all types of keys.
58///
59/// The two hash functions `hash1` and `hash2` have similar characteristics.
60/// `hash2` is a bit faster than `hash1`, especially for longer keys; it also
61/// does not produce many collisions as determined experimentally for the key
62/// types given in the example below. On the other hand, it does not have the
63/// formal no-funnel guarantees of `hash1` which is reasonably fast and provably
64/// achieves low collision probability.
65///
66/// Furthermore, this component provides a third hash function `hash0` that
67/// performs a simple hash by taking the modulus of a user specified size on the
68/// input key. This hash function has a poor distribution, as the function
69/// exhibits a clustering effect when the input keys only differ slightly.
70/// However, `hash0` is about 6 to 8 times faster than `hash1` and `hash2`.
71///
72/// In general, use `hash0` when speed matters, and `hash1` or `hash2` when a
73/// more thorough hash is needed (e.g., keys exhibit a pattern that results in
74/// many collision when using `bdeimp`, or cost of collision is high but hashing
75/// is not a bottleneck).
76///
77/// ## Hashing Fundamental Types {#bdlb_hashutil-hashing-fundamental-types}
78///
79///
80/// In order to facilitate hashing user-defined types (see next section), this
81/// component supports hashing fundamental types in a manner that is both
82/// seemingly random (i.e., the hash is good) but predictable (i.e., identical
83/// on all platforms, irrespective of endianness). Note that the following
84/// expressions for some `key` of a fundamental integral type:
85/// @code
86/// bdlb::HashUtil::hash1((const char *)&key, sizeof(key)); // not recommended
87/// bdlb::HashUtil::hash2((const char *)&key, sizeof(key)); // not recommended
88/// @endcode
89/// return values that differ on big- and little-endian platforms if the key
90/// size is more than one byte. Instead, the recommended way to hash this `key`
91/// is simply to:
92/// @code
93/// bdlb::HashUtil::hash1(key);
94/// bdlb::HashUtil::hash2(key);
95/// @endcode
96/// This works if `key` is of one of the following fundamental types:
97/// @code
98/// char, signed char, unsigned char,
99/// short, unsigned short,
100/// int, unsigned int,
101/// long, unsigned long,
102/// bsls::Types::Int64, bsls::Types::Uint64,
103/// float, double,
104/// const void *
105/// @endcode
106///
107/// ## Hashing User-defined Types {#bdlb_hashutil-hashing-user-defined-types}
108///
109///
110/// This component can be used for hashing a user-defined type as well. The
111/// recommended way to do this is to provide a `hash` class method for the type,
112/// that takes an instance of the class and a size. For instance, let us
113/// consider the following class (the meaning and implementation are irrelevant
114/// here) with the following private data members:
115/// @code
116/// /// This class is an aggregate.
117/// class UserDefinedTickerType {
118///
119/// enum { CUSIP_LENGTH = 10 };
120///
121/// // PRIVATE DATA MEMBERS
122/// bool d_isConstant; // not recommended, because of padding
123/// char d_cusip[CUSIP_LENGTH];
124/// int d_value;
125/// @endcode
126/// A hash class method can be provided and documented as follows:
127/// @code
128/// public:
129/// // CLASS METHODS
130///
131/// // Return a hash value calculated from the specified `object` using
132/// // the specified `size` as the number of slots. The hash value
133/// // is guaranteed to be in the range [0, size).
134/// static int hash(const UserDefinedTickerType& object, int size);
135///
136/// // Rest of the class definition omitted...
137/// };
138/// @endcode
139/// The implementation of `hash` can be very simple, depending on the semantics
140/// of the class. Here we assume that `d_isConstant` is an implementation
141/// detail and that data members `d_cusip` and `d_value` should participate in
142/// the hash and that the character string should be hashed by value. The
143/// implementation follows:
144/// @code
145/// int UserDefinedTickerType::hash(const UserDefinedTickerType& object,
146/// int size)
147/// {
148/// return (bdlb::HashUtil::hash1(object.d_cusip, CUSIP_LENGTH) +
149/// bdlb::HashUtil::hash1(object.d_value)) % size;
150/// }
151/// @endcode
152/// Note that more intricate combinations of the individual hashes in the
153/// aggregate can be appropriate if the individual members in the aggregate my
154/// exhibit a pattern.
155///
156/// CAVEAT: When hashing user-defined classes, because of the potential padding
157/// in the class footprint, the simple-minded hashes
158/// @code
159/// const UserDefinedTickerType& key;
160/// bdlb::HashUtil::hash1((const char *)&key, sizeof(key)); // wrong, because
161/// bdlb::HashUtil::hash2((const char *)&key, sizeof(key)); // of padding
162/// @endcode
163/// may actually *not* produce the same hash values for copies of the same
164/// object.
165///
166/// ## Definitions and Formal Statement of No-Funneling Property {#bdlb_hashutil-definitions-and-formal-statement-of-no-funneling-property}
167///
168///
169/// Let us say a given bit in the input affects a given bit in the output (which
170/// has `v` bits) if two keys differing only at that input bit will differ at
171/// the given output bit about half the time. We define a hash to be a
172/// funneling hash if there is some subset of size `t` of all the input bits
173/// that can affect only a subset of size `u` of bits in the output, and `t > u`
174/// and `v > u`, and we say that the hash function has a funnel of those `t`
175/// input bits into those `u` bits. In that case, then `u` of those `t` bits
176/// can cancel out the effects of the other `t - u`, and so the set of keys
177/// differing only in the input bits of the funnel can produce no more than half
178/// that number of hash values. (Those `2^t` keys can produce no more than
179/// `2^u` out of `2^v` hash values.) Differing in only a few bits is a common
180/// pattern in human and computer keys, so a funneling hash is seriously flawed.
181///
182/// This component offers a hash function (`hash1`) that has no funnels, except
183/// for a funnel of 32 bits into 31 bits, which is a negligible vulnerability.
184/// The `hash2` does not have known no-funneling properties, but operates on the
185/// same principle and is just as good (and faster) in experiments with chaining
186/// as in the usage below. In addition, these two hash functions produce full
187/// 32-bit results and allow hash table sizes to be a power of two.
188///
189/// For further details and references, the interested reader is invited to
190/// consult `http://burtleburtle.net/bob/hash/evahash.html`.
191///
192/// ## Usage {#bdlb_hashutil-usage}
193///
194///
195/// This section illustrates intended use of this component.
196///
197/// ### Examples Introduction {#bdlb_hashutil-examples-introduction}
198///
199///
200/// We illustrate the usage of this component by some experiments (code and
201/// results) on the number of collision expected for various usage. For
202/// returning the results of an experiment, we will use the following structure:
203/// @code
204/// struct ExperimentalResult {
205///
206/// // DATA MEMBERS
207/// int d_max; // maximum length of a chain
208/// double d_average; // average length of a chain
209/// double d_sigma; // standard deviation
210///
211/// // CREATORS
212///
213/// /// Create an experimental result reporting the optionally specified
214/// /// `max`, `avg`, and `sigma` values.
215/// explicit
216/// ExperimentalResult(int max = 0, double avg = 0, double sigma = 0)
217/// : d_max(max), d_average(avg), d_sigma(sigma)
218/// {}
219/// };
220/// @endcode
221/// For generating the data, we use a parameterized `GENERATOR` of which we give
222/// three implementations below (sequential integers, character strings, and
223/// multiple-field keys). This template parameter needs to validate the
224/// following syntactic expressions, which we call in this usage example our
225/// `GeneratorConcept`. In those requirements, `generator` is an instance of
226/// `GENERATOR`.
227/// @code
228/// const char *data = generator.data(); // address of storage of current
229/// // value
230/// int length = generator.length(); // length of storage of current
231/// // value
232///
233/// generator.next(); // advance to next value
234/// @endcode
235///
236/// ### Example 1: Using Chaining {#bdlb_hashutil-example-1-using-chaining}
237///
238///
239/// The following code will check the number of collisions for a chaining-based
240/// hash table given a certain distribution of keys:
241/// @code
242/// /// Simulate insertion of the specified `numElements` generated by the
243/// /// specified `input` generator into a hash table of the specified
244/// /// `size` using chaining as the mechanism to resolve collisions.
245/// /// Return the results (maximum and average length of a chain, with
246/// /// standard deviation). `GENERATOR` must be a model of the
247/// /// `GeneratorConcept`. For the good functioning of this function, it
248/// /// is required that `input` never be default nor repeat itself.
249/// template <class GENERATOR>
250/// ExperimentalResult computeChainingCollisions(int numElements,
251/// int size,
252/// GENERATOR input)
253/// {
254/// bsl::vector<int> table(size); // all counters are init'ed to 0
255///
256/// for (int i = 0; i < numElements; ++i, input.next()) {
257/// unsigned int hash = Util::hash1(input.data(),
258/// input.length());
259/// ++table[hash % size];
260/// }
261///
262/// double avgLength = 0; // average number of collisions
263/// double sigmaLength = 0; // standard deviation from average
264/// int maxLength = 0; // maximum length of a chain
265///
266/// for (int k = 0; k < size; ++k) {
267/// avgLength += table[k];
268/// sigmaLength += table[k] * table[k];
269/// maxLength = bsl::max(maxLength, table[k]);
270/// }
271/// avgLength /= size;
272/// sigmaLength = bsl::sqrt(sigmaLength / size - avgLength * avgLength);
273///
274/// return ExperimentalResult(maxLength, avgLength, sigmaLength);
275/// }
276/// @endcode
277///
278/// ### Example 2: Using Double-Hashing {#bdlb_hashutil-example-2-using-double-hashing}
279///
280///
281/// The following code will check the number of collisions for a double-hashing
282/// based hash table (such as @ref bdlc_hashtable2 ) given a certain distribution
283/// of keys:
284/// @code
285/// /// Simulate insertion of the specified `numElements` generated by the
286/// /// specified `input` generator into a hash table of the specified
287/// /// `size` using double hashing as the mechanism to resolve collisions.
288/// /// Return the results (maximum and average length of a chain, with
289/// /// standard deviation). `GENERATOR` must be a model of the
290/// /// `GeneratorConcept`. For the good functioning of this function, it
291/// /// is required that `input` never be default nor repeat itself.
292/// template <class GENERATOR>
293/// ExperimentalResult computeDoubleHashingCollisions(int numElements,
294/// int size,
295/// GENERATOR input)
296/// {
297/// typedef typename GENERATOR::ResultType ResultType;
298/// bsl::vector<ResultType> table(size); // all counters are default-ct'ed
299///
300/// double avgLength = 0; // average number of collisions
301/// double sigmaLength = 0; // standard deviation from average
302/// int maxLength = 0; // maximum length of a chain
303///
304/// for (int i = 0; i < numElements; ++i, input.next()) {
305/// unsigned int hash1 = Util::hash1(input.data(),
306/// input.length());
307///
308/// int chainLength = 0;
309/// int bucket = hash1 % size;
310/// if (ResultType() == table[bucket]) {
311/// table[bucket] = input.current();
312/// } else {
313/// unsigned int hash2 = Util::hash2(input.data(),
314/// input.length());
315/// hash2 = 1 + hash2 % (size - 1);
316///
317/// while (++chainLength < size) {
318/// bucket = (bucket + hash2) % size;
319///
320/// if (ResultType() == table[bucket]) {
321/// table[bucket] = input.current();
322/// break; // while loop
323/// }
324/// }
325/// }
326///
327/// if (chainLength == size) {
328/// bsl::cerr << "\tCould not insert in doubly-hashed table\n";
329/// avgLength = bsl::numeric_limits<double>::infinity();
330/// sigmaLength = bsl::numeric_limits<double>::infinity();
331/// maxLength = static_cast<int>(
332/// bsl::numeric_limits<double>::infinity());
333/// return ExperimentalResult(maxLength, avgLength, sigmaLength);
334/// // RETURN
335/// }
336///
337/// avgLength += chainLength;
338/// sigmaLength += chainLength * chainLength;
339/// maxLength = bsl::max(maxLength, chainLength);
340/// }
341/// avgLength /= numElements;
342/// sigmaLength = bsl::sqrt(sigmaLength / numElements
343/// - avgLength * avgLength);
344///
345/// return ExperimentalResult(maxLength, avgLength, sigmaLength);
346/// }
347/// @endcode
348///
349/// ### Example 3: Creating a Generator of Sequential Integers {#bdlb_hashutil-example-3-creating-a-generator-of-sequential-integers}
350///
351///
352/// Here is a simple generator that returns a sequence of integers. This
353/// generator has a period of `INT_MAX / gcd(increment, INT_MAX+1)`.
354/// @code
355/// class SequentialIntegers {
356/// int d_current;
357/// int d_inc;
358///
359/// public:
360/// // TYPES
361/// typedef int ResultType;
362/// // Type returned by this generator.
363///
364/// // CREATORS
365///
366/// // Create a generator returning integers in a sequence starting at
367/// // the optionally specified `first` integer, with the optionally
368/// // specified `increment`.
369/// explicit SequentialIntegers(int first = 1, int increment = 1)
370/// : d_current(first), d_inc(increment) {}
371///
372/// // MANIPULATORS
373///
374/// /// Advance to the next element.
375/// void next()
376/// {
377/// d_current += d_inc;
378/// }
379///
380/// // ACCESSORS
381///
382/// /// Return current element.
383/// ResultType current() const
384/// {
385/// return d_current;
386/// }
387///
388/// /// Return data buffer of result type.
389/// const char *data() const
390/// {
391/// return reinterpret_cast<const char*>(&d_current);
392/// }
393///
394/// /// Return length of result type (in bytes).
395/// int length() const
396/// {
397/// return sizeof(ResultType);
398/// }
399/// };
400/// @endcode
401///
402/// ### Example 4: Creating a Character String Generator {#bdlb_hashutil-example-4-creating-a-character-string-generator}
403///
404///
405/// Here is a simple generator that returns a sequence of strings that are a
406/// permutation of an initial string. This generator has a period of
407/// `factorial(initial.length())` where `factorial(N)` returns the number of
408/// permutations of `N` distinct elements.
409/// @code
410/// class SequentialStrings {
411/// size_t d_length;
412/// bsl::string d_current;
413///
414/// public:
415/// // TYPES
416/// typedef bsl::string ResultType;
417/// // Type returned by this generator.
418///
419/// // CREATORS
420///
421/// /// Create a generator returning strings in a sequence starting at
422/// /// the specified `initial` string (sorted by characters) and
423/// /// looping through all permutations of `str`. The behavior is
424/// /// undefined unless all the characters of the `initial` string are
425/// /// distinct.
426/// explicit SequentialStrings(bsl::string const& initial)
427/// : d_length(initial.length()), d_current(initial)
428/// {
429/// bsl::sort(d_current.begin(), d_current.end());
430/// assert(bsl::unique(d_current.begin(), d_current.end()) ==
431/// d_current.end());
432/// }
433///
434/// // MANIPULATORS
435///
436/// /// Advance to the next element.
437/// void next()
438/// {
439/// bsl::next_permutation(d_current.begin(), d_current.end());
440/// }
441///
442/// // ACCESSORS
443///
444/// /// Return current element.
445/// ResultType current() const
446/// {
447/// return d_current;
448/// }
449///
450/// /// Return data buffer of result type.
451/// const char *data() const
452/// {
453/// return d_current.data();
454/// }
455///
456/// /// Return length of result type (in bytes).
457/// size_t length() const
458/// {
459/// return d_current.length();
460/// }
461/// };
462/// @endcode
463///
464/// ### Example 5: Creating a Multiple-Field Keys Generator {#bdlb_hashutil-example-5-creating-a-multiple-field-keys-generator}
465///
466///
467/// It is not uncommon for keys to consist of concatenated keys in multiple
468/// combinations. We simulate this by a character string in which each
469/// character loops through a specified number of values. The period of this
470/// generator is the product of the length of each character range.
471/// @code
472/// struct SequentialVector {
473/// bsl::vector<char> d_ranges;
474/// size_t d_length;
475/// bsl::string d_current;
476///
477/// public:
478/// // TYPES
479/// typedef bsl::string ResultType;
480/// // Type returned by this generator.
481///
482/// // CREATORS
483///
484/// /// Create a generator returning strings having the same length as
485/// /// the specified `ranges` (i.e., `ranges.size()`) in a sequence
486/// /// starting at the string with all null characters and looping
487/// /// through all the strings with each character at position `i` in
488/// /// the specified range from 0 until `ranges[i]` (excluded). The
489/// /// behavior is undefined unless `ranges` does not contain zero
490/// /// entries.
491/// explicit SequentialVector(bsl::vector<char> const& ranges)
492/// : d_ranges(ranges)
493/// , d_length(ranges.size())
494/// , d_current(d_length, (char)0)
495/// {
496/// }
497///
498/// // MANIPULATORS
499///
500/// /// Advance to the next element.
501/// void next()
502/// {
503/// for (size_t i = 0;
504/// i < d_length && ++d_current[i] == d_ranges[i]; ++i) {
505/// d_current[i] = 0;
506/// }
507/// }
508///
509/// // ACCESSORS
510///
511/// /// Return current element.
512/// ResultType current() const
513/// {
514/// return d_current;
515/// }
516///
517/// /// Return data buffer of current value.
518/// const char *data() const
519/// {
520/// return d_current.data();
521/// }
522///
523/// /// Return length of current value (in bytes).
524/// size_t length() const
525/// {
526/// return d_current.length();
527/// }
528/// };
529/// @endcode
530///
531/// ### Example 6: Experimental Results {#bdlb_hashutil-example-6-experimental-results}
532///
533///
534/// Checking the above distributions is done in a straightforward manner. We do
535/// this at various load factors (the load factor is the percentage of the table
536/// that actually holds data; it is defined as `N / SIZE` where `N` is the
537/// number of elements and `SIZE` is the size of the hash table). Note that
538/// chaining accommodates arbitrary load factors, while double hashing requires
539/// that the load factor be strictly less than 1.
540/// @code
541/// int usageExample(int verbose, int veryVerbose, int /* veryVeryVerbose */) {
542/// const int SIZE = 10000;
543/// const int INC = SIZE / 5; // load factors for every 20% percentile
544/// const int COLS = (4*SIZE)/INC;
545///
546/// {
547/// ExperimentalResult results[3][COLS];
548/// bsls::Stopwatch timer;
549///
550/// if (verbose) cout << "\nUsing chaining"
551/// << "\n--------------" << endl;
552///
553/// if (verbose) cout << "Sequential Integers\n";
554/// timer.start();
555/// for (int n = INC, i = 0; n < 4*SIZE; n += INC, ++i) {
556/// if (veryVerbose)
557/// cout << "\tWith load factor " <<
558/// static_cast<double>(n) / SIZE << "\n";
559///
560/// results[0][i] = computeChainingCollisions(
561/// n,
562/// SIZE,
563/// SequentialIntegers());
564/// assert(results[0][i].d_average < 1.5 * n / SIZE);
565/// }
566/// if (verbose) cout << "\t... in " << timer.elapsedTime() << "\n";
567/// timer.reset();
568///
569/// if (verbose) cout << "Sequential Strings\n";
570/// timer.start();
571/// for (int n = INC, i = 0; n < 4*SIZE; n += INC, ++i) {
572/// if (veryVerbose)
573/// cout << "\tWith load factor " <<
574/// static_cast<double>(n) / SIZE << "\n";
575///
576/// bsl::string initial("abcdefghij"); // period = 10! = 3628800
577/// results[1][i] = computeChainingCollisions(
578/// n,
579/// SIZE,
580/// SequentialStrings(initial));
581/// assert(results[1][i].d_average < 1.5 * n / SIZE);
582/// }
583/// if (verbose) cout << "\t... in " << timer.elapsedTime() << "\n";
584/// timer.reset();
585///
586/// if (verbose) cout << "Sequential Vector\n";
587/// timer.start();
588/// for (int n = INC, i = 0; n < 4*SIZE; n += INC, ++i) {
589/// if (veryVerbose)
590/// cout << "\tWith load factor " <<
591/// static_cast<double>(n) / SIZE << "\n";
592///
593/// bsl::vector<char> ranges(6, static_cast<char>(4));
594/// // period = 4^6 = 4096
595/// results[2][i] = computeChainingCollisions(
596/// n,
597/// SIZE,
598/// SequentialVector(ranges));
599/// assert(results[2][i].d_average < 1.5 * n / SIZE);
600/// }
601/// if (verbose) cout << "\t... in " << timer.elapsedTime() << "\n";
602/// timer.reset();
603///
604/// if (verbose) {
605/// cout << "\nDisplaying average (max) for chaining:";
606/// cout << "\n--------------------------------------n";
607/// const char *ROW_LABELS[] = { "\nIntegers:",
608/// "\nStrings :",
609/// "\nVector :",
610/// "\nLoad factor:",
611/// };
612/// const int NUM_ROWS = sizeof ROW_LABELS
613/// / sizeof *ROW_LABELS - 1;
614///
615/// cout << ROW_LABELS[NUM_ROWS] << bsl::setprecision(2);
616/// for (int n = INC; n < 4*SIZE; n += INC) {
617/// cout << "\t" << static_cast<double>(n) / SIZE;
618/// }
619/// for (int j = 0; j < NUM_ROWS; ++j) {
620/// cout << ROW_LABELS[j];
621/// for (int i = 0; i < COLS; ++i) {
622/// cout << "\t" << results[j][i].d_average;
623/// cout << "(" << results[j][i].d_max << ")";
624/// }
625/// cout << "\n";
626/// }
627/// }
628/// }
629/// @endcode
630/// We repeat the same steps with double hashing, except that due to the nature
631/// of the collision-resolution mechanism, the tolerance for the average must be
632/// slightly increased to 2.5 times the load factor, when the load factor is
633/// high.
634/// @code
635/// {
636/// // const int SIZE = 1000003; // must be a prime number
637/// const int SIZE = 10007; // must be a prime number
638/// const int INC = SIZE / 5; // load factors for every 20% percentile
639/// const int COLS = SIZE/INC;
640///
641/// ExperimentalResult results[3][COLS];
642/// bsls::Stopwatch timer;
643///
644/// if (verbose) cout << "\nUsing double hashing"
645/// << "\n--------------------" << endl;
646///
647/// if (verbose) cout << "Sequential Integers\n";
648/// timer.start();
649/// for (int n = INC/2, i = 0; n < SIZE; n += INC, ++i) {
650/// if (veryVerbose)
651/// cout << "\tWith load factor " <<
652/// static_cast<double>(n) / SIZE << "\n";
653///
654/// results[0][i] = computeDoubleHashingCollisions(
655/// n,
656/// SIZE,
657/// SequentialIntegers());
658/// assert(results[0][i].d_average < 2.5 * n / SIZE);
659/// }
660/// if (verbose) cout << "\t... in " << timer.elapsedTime() << "\n";
661/// timer.reset();
662///
663/// if (verbose) cout << "Sequential Strings\n";
664/// timer.start();
665/// for (int n = INC/2, i = 0; n < SIZE; n += INC, ++i) {
666/// if (veryVerbose)
667/// cout << "\tWith load factor " <<
668/// static_cast<double>(n) / SIZE << "\n";
669///
670/// bsl::string initial("abcdefghij"); // period = 10! = 3628800
671/// results[1][i] = computeDoubleHashingCollisions(
672/// n,
673/// SIZE,
674/// SequentialStrings(initial));
675/// assert(results[1][i].d_average < 2.5 * n / SIZE);
676/// }
677/// if (verbose) cout << "\t... in " << timer.elapsedTime() << "\n";
678/// timer.reset();
679///
680/// if (verbose) cout << "Sequential Vector\n";
681/// timer.start();
682/// for (int n = INC/2, i = 0; n < SIZE; n += INC, ++i) {
683/// if (veryVerbose)
684/// cout << "\tWith load factor " <<
685/// static_cast<double>(n) / SIZE << "\n";
686///
687/// bsl::vector<char> ranges(7, static_cast<char>(8));
688/// // period = 8^7 = 2097152
689/// results[2][i] = computeDoubleHashingCollisions(
690/// n,
691/// SIZE,
692/// SequentialVector(ranges));
693/// assert(results[2][i].d_average < 2.5 * n / SIZE);
694/// }
695/// if (verbose) cout << "\t... in " << timer.elapsedTime() << "\n";
696/// timer.reset();
697///
698/// if (verbose) {
699/// cout << "\nDisplaying average (max) for double-hashing:";
700/// cout << "\n--------------------------------------------\n";
701/// const char *ROW_LABELS[] = { "\nIntegers:",
702/// "\nStrings :",
703/// "\nVector :",
704/// "\nLoad factor:",
705/// };
706/// const int NUM_ROWS = sizeof ROW_LABELS
707/// / sizeof *ROW_LABELS -1;
708///
709/// cout << ROW_LABELS[NUM_ROWS] << bsl::setprecision(2);
710/// for (int n = INC/2; n < SIZE; n += INC) {
711/// cout << "\t" << static_cast<double>(n) / SIZE;
712/// }
713/// for (int j = 0; j < NUM_ROWS; ++j) {
714/// cout << ROW_LABELS[j];
715/// for (int i = 0; i < COLS; ++i) {
716/// cout << "\t" << results[j][i].d_average;
717/// cout << "(" << results[j][i].d_max << ")";
718/// }
719/// cout << "\n";
720/// }
721/// }
722/// }
723///
724/// return 0;
725/// }
726/// @endcode
727/// The above code produces the following results. The results for chaining are
728/// identical for `hash1` (used in the code above) or `hash2` (code identical
729/// except `hash2` is used in place of `hash1` in `computeChainingCollisions`).
730/// @code
731/// Displaying average(max) for chaining:
732/// ---------------------------------------------------------------------------
733/// Load factor: 0.2 0.4 0.6 0.8 1 1.2 1.4
734/// Integers: 0.2(3) 0.4(5) 0.6(6) 0.8(6) 1(7) 1.2(7) 1.4(8)
735/// Strings : 0.2(4) 0.4(4) 0.6(5) 0.8(7) 1(7) 1.2(7) 1.4(7)
736/// Vector : 0.2(5) 0.4(5) 0.6(10) 0.8(10) 1(15) 1.2(15) 1.4(20)
737///
738/// Load factor: 1.6 1.8 2 2.2 2.4 2.6 2.8
739/// Integers: 1.6(9) 1.8(9) 2(10) 2.2(10) 2.4(10) 2.6(10) 2.8(11)
740/// Strings : 1.6(7) 1.8(8) 2(9) 2.2(11) 2.4(11) 2.6(11) 2.8(11)
741/// Vector : 1.6(20) 1.8(24) 2(25) 2.2(29) 2.4(30) 2.6(34) 2.8(35)
742///
743/// Load factor: 3 3.2 3.4 3.6 3.8
744/// Integers: 3(11) 3.2(11) 3.4(12) 3.6(12) 3.8(13)
745/// Strings : 3(12) 3.2(12) 3.4(13) 3.6(13) 3.8(13)
746/// Vector : 3(39) 3.2(40) 3.4(42) 3.6(45) 3.8(46)
747///
748/// Displaying average(max) for double-hashing:
749/// ---------------------------------------------------------------------------
750/// Load factor: 0.1 0.3 0.5 0.7 0.9
751/// Integers: 0.046(2) 0.20(4) 0.37(10) 0.71(15) 1.6(59)
752/// Strings : 0.064(2) 0.20(6) 0.40(12) 0.75(18) 1.6(50)
753/// Vector : 0.05(2) 0.19(5) 0.58(9) 1.20(15) 2.4(64)
754/// @endcode
755/// @}
756/** @} */
757/** @} */
758
759/** @addtogroup bdl
760 * @{
761 */
762/** @addtogroup bdlb
763 * @{
764 */
765/** @addtogroup bdlb_hashutil
766 * @{
767 */
768
769#include <bdlscm_version.h>
770
771#include <bsls_assert.h>
772#include <bsls_review.h>
773#include <bsls_types.h>
774
775
776namespace bdlb {
777 // ===============
778 // struct HashUtil
779 // ===============
780
781/// This `struct` provides a namespace for hash functions.
782struct HashUtil {
783
784 // CLASS METHODS
785
786 static unsigned int hash0(char key, int modulus);
787 static unsigned int hash0(signed char key, int modulus);
788 static unsigned int hash0(unsigned char key, int modulus);
789 static unsigned int hash0(short key, int modulus);
790 static unsigned int hash0(unsigned short key, int modulus);
791 static unsigned int hash0(int key, int modulus);
792 static unsigned int hash0(unsigned int key, int modulus);
793 static unsigned int hash0(long key, int modulus);
794 static unsigned int hash0(unsigned long key, int modulus);
795 static unsigned int hash0(bsls::Types::Int64 key, int modulus);
796 static unsigned int hash0(bsls::Types::Uint64 key, int modulus);
797 static unsigned int hash0(float key, int modulus);
798 static unsigned int hash0(double key, int modulus);
799 /// Return an unsigned integer hash value in the range from zero to one
800 /// less than the specified `modulus` corresponding to the specified
801 /// `key`. The behavior is undefined unless `0 < modulus < 2^31`. Note
802 /// that `modulus` is expected to be a prime not close to an integral
803 /// power of 2. Also note that specifying a `modulus` of 1 will cause 0
804 /// to be returned for every `value`.
805 static unsigned int hash0(const void *key, int modulus);
806
807 /// Return a pseudo-random unsigned integer in the range from zero to
808 /// one less than the specified `modulus` corresponding to the specified
809 /// null-terminated `string`. The behavior is undefined unless
810 /// `0 < modulus < 2^31`. Note that `modulus` is expected to be a prime
811 /// not close to an integral power of 2. Also note that specifying a
812 /// `modulus` of 1 will cause 0 to be returned for every `string`.
813 static unsigned int hash0(const char *string, int modulus);
814
815 static unsigned int hash0(const char *string,
816 int stringLength,
817 int modulus);
818 /// Return a pseudo-random unsigned integer in the range from zero to
819 /// one less than the specified `modulus` corresponding to the specified
820 /// `string` having the specified `stringLength`. `string` need not be
821 /// null-terminated and may contain embedded null characters. The
822 /// behavior is undefined unless `0 <= stringLength` and
823 /// `0 < modulus < 2^31`. Note that `modulus` is expected to be a prime
824 /// not close to an integral power of 2. Also note that specifying a
825 /// `modulus` of 1 will cause 0 to be returned for every `string`.
826
827 static unsigned int hash1(const char *data, int length);
828
829 /// Return an unsigned integer hash value corresponding to the specified
830 /// `data` of the specified `length` (in bytes). The behavior is
831 /// undefined unless `0 <= length`. Note that if `data` is 0, then
832 /// `length` also must be 0. Also note that every byte in `data` is
833 /// significant; that is, it is not appropriate to cast a `struct`
834 /// address to a `char *` unless the `struct` is packed (no padding).
835 ///
836 /// Both `hash1` and `hash2` return a hash, but both hashes can be
837 /// assumed to be independent (i.e., there are no known correlations
838 /// between the results of the two hash functions given the same input
839 /// data).
840 static unsigned int hash2(const char *data, int length);
841
842 static unsigned int hash1(char key);
843 static unsigned int hash1(signed char key);
844 static unsigned int hash1(unsigned char key);
845 static unsigned int hash1(short key);
846 static unsigned int hash1(unsigned short key);
847 static unsigned int hash1(int key);
848 static unsigned int hash1(unsigned int key);
849 static unsigned int hash1(long key);
850 static unsigned int hash1(unsigned long key);
851 static unsigned int hash1(bsls::Types::Int64 key);
852 static unsigned int hash1(bsls::Types::Uint64 key);
853 static unsigned int hash1(float key);
854 static unsigned int hash1(double key);
855 /// Return an unsigned integer hash value corresponding to the specified
856 /// `key`. Note that the return value is seemingly random (i.e., the
857 /// hash is good) but identical on all platforms (irrespective of
858 /// endianness). Both functions `hash1` and `hash2` return a hash, but
859 /// both hashes can be assumed to be independent (i.e., there are no
860 /// known correlations between the results of both hash functions given
861 /// the same input data).
862 static unsigned int hash1(const void *key);
863
864 static unsigned int hash2(char key);
865 static unsigned int hash2(signed char key);
866 static unsigned int hash2(unsigned char key);
867 static unsigned int hash2(short key);
868 static unsigned int hash2(unsigned short key);
869 static unsigned int hash2(int key);
870 static unsigned int hash2(unsigned int key);
871 static unsigned int hash2(long key);
872 static unsigned int hash2(unsigned long key);
873 static unsigned int hash2(bsls::Types::Int64 key);
874 static unsigned int hash2(bsls::Types::Uint64 key);
875 static unsigned int hash2(float key);
876 static unsigned int hash2(double key);
877 /// Return an unsigned integer hash value corresponding to the specified
878 /// `key`. Note that the return value is seemingly random (i.e., the
879 /// hash is good) but identical on all platforms (irrespective of
880 /// endianness). Both functions `hash1` and `hash2` return a hash, but
881 /// both hashes can be assumed to be independent (i.e., there are no
882 /// known correlations between the results of both hash functions given
883 /// the same input data).
884 static unsigned int hash2(const void *key);
885};
886
887// ============================================================================
888// INLINE DEFINITIONS
889// ============================================================================
890
891 // ---------------
892 // struct HashUtil
893 // ---------------
894
895// CLASS METHODS
896inline
897unsigned int HashUtil::hash0(int key, int modulus)
898{
899 BSLS_ASSERT(0 < modulus);
900
901 if (4 == sizeof(int)) {
902 return static_cast<unsigned int>(key)
903 % static_cast<unsigned int>(modulus); // RETURN
904 }
905
906 return (static_cast<unsigned int>(key) & 0xFFFFFFFF)
907 % static_cast<unsigned int>(modulus);
908}
909
910inline
911unsigned int HashUtil::hash0(bsls::Types::Int64 key, int modulus)
912{
913 BSLS_ASSERT(0 < modulus);
914
915 return ((static_cast<unsigned int>((key >> 32) & 0xFFFFFFFF)
916 % static_cast<unsigned int>(modulus))
917 ^ static_cast<unsigned int>(key & 0xFFFFFFFF))
918 % static_cast<unsigned int>(modulus);
919}
920
921inline
922unsigned int HashUtil::hash0(char key, int modulus)
923{
924 BSLS_ASSERT(0 < modulus);
925
926 return HashUtil::hash0(static_cast<int>(static_cast<unsigned char>(key)),
927 modulus);
928}
929
930inline
931unsigned int HashUtil::hash0(signed char key, int modulus)
932{
933 BSLS_ASSERT(0 < modulus);
934
935 return HashUtil::hash0(static_cast<int>(static_cast<unsigned char>(key)),
936 modulus);
937}
938
939inline
940unsigned int HashUtil::hash0(unsigned char key, int modulus)
941{
942 BSLS_ASSERT(0 < modulus);
943
944 return HashUtil::hash0(static_cast<int>(key), modulus);
945}
946
947inline
948unsigned int HashUtil::hash0(short key, int modulus)
949{
950 BSLS_ASSERT(0 < modulus);
951
952 return HashUtil::hash0(static_cast<int>(static_cast<unsigned short>(key)),
953 modulus);
954}
955
956inline
957unsigned int HashUtil::hash0(unsigned short key, int modulus)
958{
959 BSLS_ASSERT(0 < modulus);
960
961 return HashUtil::hash0(static_cast<int>(key), modulus);
962}
963
964inline
965unsigned int HashUtil::hash0(unsigned int key, int modulus)
966{
967 BSLS_ASSERT(0 < modulus);
968
969 return HashUtil::hash0(static_cast<int>(key), modulus);
970}
971
972inline
973unsigned int HashUtil::hash0(long key, int modulus)
974{
975 BSLS_ASSERT(0 < modulus);
976
977 if (4 == sizeof(long)) {
978 return HashUtil::hash0(
979 static_cast<int>(static_cast<unsigned long>(key)),
980 modulus); // RETURN
981 }
982 else {
983 return HashUtil::hash0(
984 static_cast<bsls::Types::Int64>(static_cast<unsigned long>(key)),
985 modulus); // RETURN
986 }
987}
988
989inline
990unsigned int HashUtil::hash0(unsigned long key, int modulus)
991{
992 BSLS_ASSERT(0 < modulus);
993
994 if (4 == sizeof(unsigned long)) {
995 return HashUtil::hash0(static_cast<int>(key), modulus); // RETURN
996 }
997 else {
998 return HashUtil::hash0(static_cast<bsls::Types::Int64>(key), modulus);
999 // RETURN
1000 }
1001}
1002
1003inline
1004unsigned int HashUtil::hash0(bsls::Types::Uint64 key, int modulus)
1005{
1006 BSLS_ASSERT(0 < modulus);
1007
1008 return HashUtil::hash0(static_cast<bsls::Types::Int64>(key), modulus);
1009}
1010
1011inline
1012unsigned int HashUtil::hash0(double key, int modulus)
1013{
1014 BSLS_ASSERT(0 < modulus);
1015
1016 bsls::Types::Int64 *v = reinterpret_cast<bsls::Types::Int64 *>(&key);
1017 return HashUtil::hash0(*v, modulus);
1018}
1019
1020inline
1021unsigned int HashUtil::hash0(float key, int modulus)
1022{
1023 BSLS_ASSERT(0 < modulus);
1024
1025 return HashUtil::hash0(static_cast<double>(key), modulus);
1026}
1027
1028inline
1029unsigned int HashUtil::hash0(const void *key, int modulus)
1030{
1031 BSLS_ASSERT(0 < modulus);
1032
1033 if (4 == sizeof(void *)) {
1034 const int *v = reinterpret_cast<const int *>(&key);
1035 return HashUtil::hash0(*v, modulus); // RETURN
1036 }
1037 else {
1038 const bsls::Types::Int64 *v = static_cast<const bsls::Types::Int64 *>(
1039 static_cast<const void *>(&key));
1040 return HashUtil::hash0(*v, modulus); // RETURN
1041 }
1042}
1043
1044} // close package namespace
1045
1046
1047#endif
1048
1049// ----------------------------------------------------------------------------
1050// Copyright 2015 Bloomberg Finance L.P.
1051//
1052// Licensed under the Apache License, Version 2.0 (the "License");
1053// you may not use this file except in compliance with the License.
1054// You may obtain a copy of the License at
1055//
1056// http://www.apache.org/licenses/LICENSE-2.0
1057//
1058// Unless required by applicable law or agreed to in writing, software
1059// distributed under the License is distributed on an "AS IS" BASIS,
1060// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1061// See the License for the specific language governing permissions and
1062// limitations under the License.
1063// ----------------------------- END-OF-FILE ----------------------------------
1064
1065/** @} */
1066/** @} */
1067/** @} */
#define BSLS_ASSERT(X)
Definition bsls_assert.h:1804
#define BSLS_IDENT(str)
Definition bsls_ident.h:195
Definition bdlb_algorithmworkaroundutil.h:74
This struct provides a namespace for hash functions.
Definition bdlb_hashutil.h:782
static unsigned int hash2(unsigned int key)
static unsigned int hash1(float key)
static unsigned int hash0(char key, int modulus)
Definition bdlb_hashutil.h:922
static unsigned int hash2(signed char key)
static unsigned int hash2(float key)
static unsigned int hash0(const char *string, int modulus)
static unsigned int hash2(unsigned short key)
static unsigned int hash0(const char *string, int stringLength, int modulus)
static unsigned int hash1(const void *key)
static unsigned int hash1(unsigned short key)
static unsigned int hash1(bsls::Types::Uint64 key)
static unsigned int hash2(unsigned char key)
static unsigned int hash2(const void *key)
static unsigned int hash2(int key)
static unsigned int hash1(double key)
static unsigned int hash1(signed char key)
static unsigned int hash2(char key)
static unsigned int hash2(bsls::Types::Int64 key)
static unsigned int hash2(bsls::Types::Uint64 key)
static unsigned int hash1(int key)
static unsigned int hash1(unsigned char key)
static unsigned int hash1(unsigned int key)
static unsigned int hash1(char key)
static unsigned int hash2(double key)
static unsigned int hash1(short key)
static unsigned int hash1(unsigned long key)
static unsigned int hash2(const char *data, int length)
static unsigned int hash1(const char *data, int length)
static unsigned int hash2(short key)
static unsigned int hash2(unsigned long key)
static unsigned int hash1(bsls::Types::Int64 key)
static unsigned int hash1(long key)
static unsigned int hash2(long key)
unsigned long long Uint64
Definition bsls_types.h:137
long long Int64
Definition bsls_types.h:132