C++ implementation of Constant Database (CDB++)

http://www.chokkan.org/software/cdbpp/

#include <fstream>#include <iomanip>#include <iostream>#include <sstream>#include <string>#include <cdbpp.h>#define N       100000#define DBNAME  "sample.cdb"// Convert an integer to its string representation. std::string int2str(int i) {     std::stringstream ss;     ss << std::setfill('0') << std::setw(6) << i;     return ss.str(); }  bool build() {     // Open a database file for writing (with binary mode).     std::ofstream ofs(DBNAME, std::ios_base::binary);     if (ofs.fail()) {         std::cerr << "ERROR: Failed to open a database file." << std::endl;         return false;     }      try {         // Create an instance of CDB++ writer.         cdbpp::builder dbw(ofs);          // Insert key/value pairs to the CDB++ writer.         for (int i = 0;i < N;++i) {             std::string key = int2str(i);             dbw.put(key.c_str(), key.length(), &i, sizeof(i));         }          // Destructing the CDB++ writer flushes the database to the stream.      } catch (const cdbpp::builder_exception& e) {         // Abort if something went wrong...         std::cerr << "ERROR: " << e.what() << std::endl;         return false;     }      return true; }  bool read() {     // Open the database file for reading (with binary mode).     std::ifstream ifs(DBNAME, std::ios_base::binary);     if (ifs.fail()) {         std::cerr << "ERROR: Failed to open a database file." << std::endl;         return false;     }      try {         // Open the database from the input stream.         cdbpp::cdbpp dbr(ifs);         if (!dbr.is_open()) {             std::cerr << "ERROR: Failed to read a database file." << std::endl;             return false;         }          // Issue queries to the database.         for (int i = 0;i < N;++i) {             size_t vsize;             std::string key = int2str(i);             const int *value = (const int*)dbr.get(key.c_str(), key.length(), &vsize);             if (value == NULL) {                 std::cerr << "ERROR: The key is not found." << std::endl;                 return false;             }              if (vsize != sizeof(int) || *value != i) {                 std::cerr << "ERROR: The key value is wrong." << std::endl;                 return false;             }         }      } catch (const cdbpp::cdbpp_exception& e) {         // Abort if something went wrong...         std::cerr << "ERROR: " << e.what() << std::endl;         return false;     }      return true; }  int main(int argc, char *argv[]) {     // Build a sample database and test it.     bool b = build() && read();     std::cout << (b ? "OK" : "FAIL") << std::endl;     return b ? 0 : 1; }

/*  *      C++ implementation of Constant Database (CDB++)  *  * Copyright (c) 2008,2009, Naoaki Okazaki  * All rights reserved.  *  * Redistribution and use in source and binary forms, with or without  * modification, are permitted provided that the following conditions are met:  *     * Redistributions of source code must retain the above copyright  *       notice, this list of conditions and the following disclaimer.  *     * Redistributions in binary form must reproduce the above copyright  *       notice, this list of conditions and the following disclaimer in the  *       documentation and/or other materials provided with the distribution.  *     * Neither the name of the authors nor the names of its contributors may  *       be used to endorse or promote products derived from this software  *       without specific prior written permission.  *  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */  /* $Id: cdbpp.h 14 2009-07-13 16:39:27Z naoaki $ */  #ifndef __CDBPP_H__ #define __CDBPP_H__  #include <cstring> #include <fstream> #include <functional> #include <iostream> #include <vector> #include <stdint.h> #include <stdexcept>  namespace cdbpp {  /**   * \addtogroup cdbpp_api CDB++ API  * @{  *  *  The CDB++ API.  */  // Global constants. enum {     // Version number.     VERSION = 1,     // The number of hash tables.     NUM_TABLES = 256,     // A constant for byte-order checking.     BYTEORDER_CHECK = 0x62445371, };     /**  * MurmurHash2.  *  *  This code makes the following assumption about how your machine behaves  *      - We can read a 4-byte value from any address without crashing.  *  *  It also has a few limitations:  *      - It will not work incrementally.  *      - It will not produce the same results on little-endian and big-endian  *        machines.  *  *  @author Austin Appleby  */ class murmurhash2 :     public std::binary_function<const void *, size_t, uint32_t> { protected:     inline static uint32_t get32bits(const char *d)     {         return *reinterpret_cast<const uint32_t*>(d);     }  public:     inline uint32_t operator() (const void *key, size_t size) const     {         // 'm' and 'r' are mixing constants generated offline.         // They're not really 'magic', they just happen to work well.          const uint32_t m = 0x5bd1e995;         const int32_t r = 24;          // Initialize the hash to a 'random' value          const uint32_t seed = 0x87654321;         uint32_t h = seed ^ size;          // Mix 4 bytes at a time into the hash          const char * data = (const char *)key;          while (size >= 4)         {             uint32_t k = get32bits(data);              k *= m;              k ^= k >> r;              k *= m;                           h *= m;              h ^= k;              data += 4;             size -= 4;         }                  // Handle the last few bytes of the input array          switch (size)         {         case 3: h ^= data[2] << 16;         case 2: h ^= data[1] << 8;         case 1: h ^= data[0];                 h *= m;         };          // Do a few final mixes of the hash to ensure the last few         // bytes are well-incorporated.          h ^= h >> 13;         h *= m;         h ^= h >> 15;          return h;     }  };     struct tableref_t {     uint32_t    offset;     // Offset to a hash table.     uint32_t    num;        // Number of elements in the hash table. };   static uint32_t get_data_begin() {     return (16 + sizeof(tableref_t) * NUM_TABLES); }    /**  * Exception class for the CDB++ builder.  */ class builder_exception : public std::invalid_argument { public:     builder_exception(const std::string& msg)         : std::invalid_argument(msg)     {     } };    /**  * CDB++ builder.  */ template <typename hash_function> class builder_base { protected:     // A bucket structure.     struct bucket     {         uint32_t    hash;       // Hash value of the record.         uint32_t    offset;     // Offset address to the actual record.          bucket() : hash(0), offset(0)         {         }          bucket(uint32_t h, uint32_t o) : hash(h), offset(o)         {         }     };      // A hash table is a vector of buckets.     typedef std::vector<bucket> hashtable;  protected:     std::ofstream&  m_os;               // Output stream.     uint32_t        m_begin;     uint32_t        m_cur;     hashtable       m_ht[NUM_TABLES];   // Hash tables.  public:     /**      * Constructs an object.      *  @param  os          The output stream to which this class write the      *                      database. This stream must be opened in the      *                      binary mode (\c std::ios_base::binary).      */     builder_base(std::ofstream& os) : m_os(os)     {         m_begin = m_os.tellp();         m_cur = get_data_begin();         m_os.seekp(m_begin + m_cur);     }      /**      * Destructs an object.      */     virtual ~builder_base()     {         this->close();     }      /**      * Inserts a pair of key and value to the database.      *  Any key in the database should be unique, but this library does not      *  check duplicated keys.      *  @param  key         The pointer to the key.      *  @param  ksize       The size of the key.      *  @param  value       The pointer to the value.      *  @param  vsize       The size of the value.      */     template <class key_t, class value_t>     void put(const key_t *key, size_t ksize, const value_t *value, size_t vsize)     {         // Write out the current record.         write_uint32((uint32_t)ksize);         m_os.write(reinterpret_cast<const char *>(key), ksize);         write_uint32((uint32_t)vsize);         m_os.write(reinterpret_cast<const char *>(value), vsize);          // Compute the hash value and choose a hash table.         uint32_t hv = hash_function()(static_cast<const void *>(key), ksize);         hashtable& ht = m_ht[hv % NUM_TABLES];          // Store the hash value and offset to the hash table.         ht.push_back(bucket(hv, m_cur));          // Increment the current position.         m_cur += sizeof(uint32_t) + ksize + sizeof(uint32_t) + vsize;     }  protected:     void close()     {         // Check the consistency of the stream offset.         if (m_begin + m_cur != (uint32_t)m_os.tellp()) {             throw builder_exception("Inconsistent stream offset");         }          // Store the hash tables. At this moment, the file pointer refers to         // the offset succeeding the last key/value pair.         for (size_t i = 0;i < NUM_TABLES;++i) {             hashtable& ht = m_ht[i];              // Do not write an empty hash table.             if (!ht.empty()) {                 // An actual table will have the double size; half elements                 // in the table are kept empty.                 int n = ht.size() * 2;                  // Allocate the actual table.                 bucket* dst = new bucket[n];                  // Put hash elements to the table with the open-address method.                 typename hashtable::const_iterator it;                 for (it = ht.begin();it != ht.end();++it) {                     int k = (it->hash >> 8) % n;                      // Find a vacant element.                     while (dst[k].offset != 0) {                         k = (k+1) % n;                     }                      // Store the hash element.                     dst[k].hash = it->hash;                     dst[k].offset = it->offset;                 }                  // Write out the new table.                 for (int k = 0;k < n;++k) {                     write_uint32(dst[k].hash);                     write_uint32(dst[k].offset);                 }                  // Free the table.                 delete[] dst;             }         }          // Store the current position.         uint32_t offset = (uint32_t)m_os.tellp();          // Rewind the stream position to the beginning.         m_os.seekp(m_begin);          // Write the file header.         char chunkid[4] = {'C','D','B','+'};         m_os.write(chunkid, 4);         write_uint32(offset - m_begin);         write_uint32(VERSION);         write_uint32(BYTEORDER_CHECK);          // Write references to hash tables. At this moment, dbw->cur points         // to the offset succeeding the last key/data pair.          for (size_t i = 0;i < NUM_TABLES;++i) {             // Offset to the hash table (or zero for non-existent tables).             write_uint32(m_ht[i].empty() ? 0 : m_cur);             // Bucket size is double to the number of elements.             write_uint32(m_ht[i].size() * 2);             // Advance the offset counter.             m_cur += sizeof(uint32_t) * 2 * m_ht[i].size() * 2;         }          // Seek to the last position.         m_os.seekp(offset);     }      inline void write_uint32(uint32_t value)     {         m_os.write(reinterpret_cast<const char *>(&value), sizeof(value));     } };    /**  * Exception class for the CDB++ reader.  */ class cdbpp_exception : public std::invalid_argument { public:     cdbpp_exception(const std::string& msg)         : std::invalid_argument(msg)     {     } };  /**  * CDB++ reader.  */ template <typename hash_function> class cdbpp_base { protected:     struct bucket_t     {         uint32_t    hash;           // Hash value of the record.         uint32_t    offset;         // Offset address to the actual record.     };       struct hashtable_t     {         uint32_t        num;            // Number of elements in the table.         const bucket_t* buckets;        // Buckets (array of bucket).     };   protected:     const uint8_t*  m_buffer;           // Pointer to the memory block.     size_t          m_size;             // Size of the memory block.     bool            m_own;              //       hashtable_t     m_ht[NUM_TABLES];   // Hash tables.     size_t          m_n;  public:     /**      * Constructs an object.      */     cdbpp_base()         : m_buffer(NULL), m_size(0), m_own(false), m_n(0)     {     }      /**      * Constructs an object by opening a database on memory.      *  @param  buffer      The pointer to the memory image of the database.      *  @param  size        The size of the memory image.      *  @param  own         If this is set to \c true, this library will call      *                      delete[] when the database is closed.      */     cdbpp_base(const void *buffer, size_t size, bool own)         : m_buffer(NULL), m_size(0), m_own(false), m_n(0)     {         this->open(buffer, size, own);     }      /**      * Constructs an object by opening a database from an input stream.      *  @param  ifs         The input stream from which this library reads      *                      a database.      */     cdbpp_base(std::ifstream& ifs)         : m_buffer(NULL), m_size(0), m_own(false), m_n(0)     {         this->open(ifs);     }      /**      * Destructs the object.      */     virtual ~cdbpp_base()     {         close();     }      /**      * Tests if the database is opened.      *  @return bool        \c true if the database is opened,      *                      \c false otherwise.      */     bool is_open() const     {         return (m_buffer != NULL);     }      /**      * Obtains the number of elements in the database.      *  @return size_t          The number of elements.      */     size_t size() const     {         return m_n;     }      /**      * Tests if the database is empty.      *  @return bool        \c true if the number of records is zero,      *                      \c false otherwise.      */     bool empty() const     {         return (m_n == 0);     }      /**      * Opens the database from an input stream.      *  @param  ifs         The input stream from which this library reads      *                      a database.      */     size_t open(std::ifstream& ifs)     {         char chunk[4], size[4];         std::istream::pos_type offset = ifs.tellg();          do {             // Read a chunk identifier.             ifs.read(chunk, 4);             if (ifs.fail()) {                 break;             }              // Check the chunk identifier.             if (std::strncmp(chunk, "CDB+", 4) != 0) {                 break;             }              // Read the size of the chunk.             ifs.read(size, 4);             if (ifs.fail()) {                 break;             }              // Allocate a memory block for the chunk.             uint32_t chunk_size = read_uint32(reinterpret_cast<uint8_t*>(size));             uint8_t* block = new uint8_t[chunk_size];              // Read the memory image from the stream.             ifs.seekg(0, std::ios_base::beg);             if (ifs.fail()) {                 break;             }             ifs.read(reinterpret_cast<char*>(block), chunk_size);             if (ifs.fail()) {                 break;             }              return this->open(block, chunk_size, true);          } while (0);          ifs.seekg(offset, std::ios::beg);         return 0;     }      /**      * Opens the database from a memory image.      *  @param  buffer      The pointer to the memory image of the database.      *  @param  size        The size of the memory image.      *  @param  own         If this is set to \c true, this library will call      *                      delete[] when the database is closed.      */     size_t open(const void *buffer, size_t size, bool own = false)     {         const uint8_t *p = reinterpret_cast<const uint8_t*>(buffer);          // Make sure that the size of the chunk is larger than the minimum size.         if (size < get_data_begin()) {             throw cdbpp_exception("The memory image is smaller than a chunk header.");         }          // Check the chunk identifier.         if (memcmp(p, "CDB+", 4) != 0) {             throw cdbpp_exception("Incorrect chunk header");         }         p += 4;          // Read the chunk header.         uint32_t csize = read_uint32(p);         p += sizeof(uint32_t);         uint32_t version = read_uint32(p);         p += sizeof(uint32_t);         uint32_t byteorder = read_uint32(p);         p += sizeof(uint32_t);          // Check the byte-order consistency.         if (byteorder != BYTEORDER_CHECK) {             throw cdbpp_exception("Inconsistent byte order");         }         // Check the chunk size.         if (size < csize) {             throw cdbpp_exception("The memory image is smaller than a chunk size.");         }          // Set memory block and size.         m_buffer = reinterpret_cast<const uint8_t*>(buffer);         m_size = size;         m_own = own;          // Set pointers to the hash tables.         m_n = 0;         const tableref_t* ref = reinterpret_cast<const tableref_t*>(p);         for (size_t i = 0;i < NUM_TABLES;++i) {             if (ref[i].offset) {                 // Set the buckets.                 m_ht[i].buckets = reinterpret_cast<const bucket_t*>(m_buffer + ref[i].offset);                 m_ht[i].num = ref[i].num;             } else {                 // An empty hash table.                 m_ht[i].buckets = NULL;                 m_ht[i].num = 0;             }              // The number of records is the half of the table size.             m_n += (ref[i].num / 2);         }          return (size_t)csize;     }      /**      * Closes the database.      */     void close()     {         if (m_own && m_buffer != NULL) {             delete[] m_buffer;         }         m_buffer = NULL;         m_size = 0;         m_n = 0;     }      /**      * Finds the key in the database.      *  @param  key         The pointer to the key.      *  @param  ksize       The size of the key.      *  @param  vsize       The pointer of a variable to which the size of the      *                      value returned. This parameter can be \c NULL.      *  @return const void* The pointer to the value.      */     const void* get(const void *key, size_t ksize, size_t* vsize) const     {         uint32_t hv = hash_function()(key, ksize);         const hashtable_t* ht = &m_ht[hv % NUM_TABLES];          if (ht->num && ht->buckets != NULL) {             int n = ht->num;             int k = (hv >> 8) % n;             const bucket_t* p = NULL;              while (p = &ht->buckets[k], p->offset) {                 if (p->hash == hv) {                     const uint8_t *q = m_buffer + p->offset;                     if (read_uint32(q) == ksize &&                         memcmp(key, q + sizeof(uint32_t), ksize) == 0) {                         q += sizeof(uint32_t) + ksize;                         if (vsize != NULL) {                             *vsize = read_uint32(q);                         }                         return q + sizeof(uint32_t);                     }                 }                 k = (k+1) % n;             }         }          if (vsize != NULL) {             *vsize = 0;         }         return NULL;     }  protected:     inline uint32_t read_uint32(const uint8_t* p) const     {         return *reinterpret_cast<const uint32_t*>(p);     } };  /// CDB++ builder with MurmurHash2. typedef builder_base<murmurhash2> builder; /// CDB++ reader with MurmurHash2. typedef cdbpp_base<murmurhash2> cdbpp;   };  /** @} */  /** @mainpage C++ implementation of Constant Database (CDB++)  @section intro Introduction  Constant Database PlusPlus (CDB++) is a C++ implementation of hash database specialized for serialization and retrieval of <i>static</i> associations between keys and their values. The database provides several features: - <b>Fast look-ups.</b> This library implements the data structure of the   <a href="http://cr.yp.to/cdb.html">Constant Database</a> proposed by   Daniel J. Bernstein. - <b>Low footprint.</b> A CDB++ database consists of a chunk header (16 bytes),   hash tables (2048 bytes and 16 bytes per record), and actual records (8 bytes   plus key/value size per record). - <b>Fast hash function.</b> CDB++ incorporates the fast and   collision-resistant hash function for strings   (<a href="http://murmurhash.googlepages.com/">MurmurHash 2.0</a>)   implemented by Austin Appleby. - <b>Chunk format.</b> The structure of CDB++ is designed to store the data in   a chunk of a file; CDB++ database can be embedded into a file with other   arbitrary data. - <b>Simple write interface.</b> CDB++ can serialize a hash database to C++   output streams (\c std::ostream). - <b>Simple read interface.</b> CDB++ can prepare a hash database from an input   stream (\c std::istream) or from a memory block on which a database image is   read or memory-mapped from a file. - <b>Cross platform.</b> The source code can be compiled on Microsoft Visual   Studio 2008, GNU C Compiler (gcc), etc. - <b>Very simple API.</b> The CDB++ API exposes only a few functions; one can   use this library just by looking at the sample code. - <b>Single C++ header implementation.</b> CDB++ is implemented in a single   header file (cdbpp.h); one can use the CDB++ API only by including cdbpp.h   in a source code.  CDB++ does not support these for simplicity: - modifying associations - checking collisions in keys - compatibility of the database format on different byte-order architectures  @section sample Sample code This sample code constructs a database "test.cdb" with 100,000 string/integer associations, "000000"/0, "000001"/1, ..., "100000"/100000 (in build function). Then the code issues string queries "000000", ..., "100000", and checks whether the values are correct (in read function).  @include sample.cpp  @section download Download  - <a href="http://www.chokkan.org/software/dist/cdbpp-1.1.tar.gz">Source code</a>  CDB++ is distributed under the term of the <a href="http://www.opensource.org/licenses/bsd-license.php">modified BSD license</a>.  @section changelog History - Version 1.1 (2009-07-14):     - Fixed a compile issue (a patch submitted by Takashi Imamichi).     - Replaced SuperFastHash with MurmurHash 2.0 (a patch submitted by       Takashi Imamichi).     - Classes cdbpp::builder_base and cdbpp::cdbpp_base taking a template       argument to configure a hash function. Classes cdbpp::builder and       cdbpp::cdbpp are now the synonyms of       \c cdbpp::builder_base<cdbpp::murmurhash2> and       \c cdbpp::cdbpp_base<cdbpp::murmurhash2>, respectively.     - Split the sample code into build and read functions. - Version 1.0 (2009-07-09):     - Initial release.  @section api Documentation  - @ref cdbpp_api "CDB++ API"  @section acknowledgements Acknowledgements  The data structure of the constant database was originally proposed by <a href="http://cr.yp.to/djb.html">Daniel J. Bernstein</a>.  The source code of CDB++ includes the <a href="http://murmurhash.googlepages.com/">MurmurHash 2.0</a> implemented by Austin Appleby.  The CDB++ distribution contains "a portable stdint.h", which is released by <a href="http://www.azillionmonkeys.com/qed/">Paul Hsieh</a> under the term of the modified BSD license, for addressing the compatibility issue of Microsoft Visual Studio 2008. The original code is available at: http://www.azillionmonkeys.com/qed/pstdint.h  @section reference References - <a href="http://cr.yp.to/cdb.html">cdb</a> by Daniel J. Bernstein. - <a href="http://www.corpit.ru/mjt/tinycdb.html">TinyCDB - a Constant DataBase</a> by Michael Tokarev. - <a href="http://cdbxx.sourceforge.net/">Constant Database C++ Bindings</a> by Stanislav Ievlev. - <a href="http://www.unixuser.org/~euske/doc/cdbinternals/index.html">Constant Database (cdb) Internals</a> by Yusuke Shinyama. - <a href="http://murmurhash.googlepages.com/">MurmurHash 2.0</a> by Austin Appleby.  */  #endif/*__CDBPP_H__*/