// MappedHash.h


//
//  MappedHash.h
//

#ifndef liblldb_MappedHash_h_
#define liblldb_MappedHash_h_

#include <assert.h>
#include <stdint.h>

#include <algorithm>
#include <map>
#include <vector>

#include "lldb/Core/DataExtractor.h"
#include "lldb/Core/Stream.h"

class MappedHash
{
public:

    // Hash algorithms that a serialized table can declare in its header.
    // The value is written to disk, so existing enumerators must never be
    // renumbered.
    enum HashFunctionType
    {
        eHashFunctionDJB        = 0u    // Daniel J Bernstein hash function that is also used by the ELF GNU_HASH sections
    };

    // Hash a NULL terminated C string using the DJB hash:
    //   h = h * 33 + c, seeded with 5381.
    // "s" must not be NULL; the empty string hashes to 5381.
    static uint32_t
    HashStringUsingDJB (const char *s)
    {
        uint32_t h = 5381;

        for (unsigned char c = *s; c; c = *++s)
            h = ((h << 5) + h) + c;   // (h << 5) + h == h * 33

        return h;
    }

    // Hash a NULL terminated C string "s" using the hash function selected
    // by "hash_function" (a HashFunctionType value, typically read from a
    // table header). Asserts on unknown hash functions and returns 0 in
    // release builds.
    static uint32_t
    HashString (const uint8_t hash_function, const char *s)
    {
        switch (hash_function)
        {
            case MappedHash::eHashFunctionDJB:
                return HashStringUsingDJB (s);

            default:
                break;
        }
        assert (!"Invalid hash function index");
        return 0;
    }

    // Magic bytes at the start of a serialized table ("HASH"). Reading
    // HASH_CIGAM instead of HASH_MAGIC means the table was produced with
    // the opposite byte order and the reader must byte swap (see
    // Header::Read()).
    static const uint32_t HASH_MAGIC = 0x48415348u;
    static const uint32_t HASH_CIGAM = 0x48534148u;

    // The fixed header of a serialized hash table, followed on disk by a
    // client defined blob of "header_data_len" bytes whose in-memory type
    // is the template parameter T.
    template <typename T>
    struct Header
    {
        typedef T HeaderData;

        uint32_t    magic;             // HASH_MAGIC or HASH_CIGAM magic value to allow endian detection
        uint16_t    version;           // Version number
        uint16_t    hash_function;     // The hash function enumeration that was used
        uint32_t    bucket_count;      // The number of buckets in this hash table
        uint32_t    hashes_count;      // The total number of unique hash values and hash data offsets in this table
        uint32_t    header_data_len;   // The size in bytes of the "header_data" template member below
        HeaderData  header_data;       // Client defined header data, serialized as "header_data_len" bytes

        Header () :
            magic (HASH_MAGIC),
            version (1),
            hash_function (eHashFunctionDJB),
            bucket_count (0),
            hashes_count (0),
            header_data_len (sizeof(T)),
            header_data ()
        {
        }

        virtual
        ~Header()
        {
        }

        // Number of bytes this header occupies when serialized: the six
        // fixed fields plus "header_data_len" bytes of header data.
        size_t
        GetByteSize() const
        {
            return  sizeof(magic) +
                    sizeof(version) +
                    sizeof(hash_function) +
                    sizeof(bucket_count) +
                    sizeof(hashes_count) +
                    sizeof(header_data_len) +
                    header_data_len;
        }

        // Return the serialized size of "header_data". Subclasses with
        // variable sized header data should override this. The default
        // matches the value the constructor installs in header_data_len.
        // FIX: this was pure virtual, which made "HeaderType header;" in
        // ExportTable::Save() impossible to instantiate.
        virtual size_t
        GetByteSize (const HeaderData &header_data)
        {
            return sizeof (HeaderData);
        }

        void
        SetHeaderDataByteSize (uint32_t header_data_byte_size)
        {
            header_data_len = header_data_byte_size;
        }

        // Dump a human readable description of the fixed header fields to
        // the stream "s".
        void
        Dump (lldb_private::Stream &s)
        {
            s.Printf ("header.magic              = 0x%8.8x\n", magic);
            s.Printf ("header.version            = 0x%4.4x\n", version);
            s.Printf ("header.hash_function      = 0x%4.4x\n", hash_function);
            s.Printf ("header.bucket_count       = 0x%8.8x %u\n", bucket_count, bucket_count);
            s.Printf ("header.hashes_count       = 0x%8.8x %u\n", hashes_count, hashes_count);
            s.Printf ("header.header_data_len    = 0x%8.8x %u\n", header_data_len, header_data_len);
        }

        // Read the fixed header fields from "data" starting at "offset".
        // If the HASH_CIGAM magic value is found, the byte order of "data"
        // is toggled so all subsequent extraction is byte swapped (NOTE:
        // "magic" is left holding the swapped HASH_CIGAM value in that
        // case). Returns the offset of the first byte past the fixed
        // fields -- the "header_data" bytes are NOT consumed here -- or
        // UINT32_MAX on error (bad magic, unsupported version, or not
        // enough data).
        virtual uint32_t
        Read (lldb_private::DataExtractor &data, uint32_t offset)
        {
            if (data.ValidOffsetForDataOfSize (offset,
                                               sizeof (magic) +
                                               sizeof (version) +
                                               sizeof (hash_function) +
                                               sizeof (bucket_count) +
                                               sizeof (hashes_count) +
                                               sizeof (header_data_len)))
            {
                magic = data.GetU32 (&offset);
                if (magic != HASH_MAGIC)
                {
                    if (magic == HASH_CIGAM)
                    {
                        switch (data.GetByteOrder())
                        {
                            case lldb::eByteOrderBig:
                                data.SetByteOrder(lldb::eByteOrderLittle);
                                break;
                            case lldb::eByteOrderLittle:
                                data.SetByteOrder(lldb::eByteOrderBig);
                                break;
                            default:
                                return UINT32_MAX;
                        }
                    }
                    else
                    {
                        // Magic bytes didn't match
                        version = 0;
                        return UINT32_MAX;
                    }
                }

                version = data.GetU16 (&offset);
                if (version != 1)
                {
                    // Unsupported version
                    return UINT32_MAX;
                }
                hash_function       = data.GetU16 (&offset);
                if (hash_function == 4)
                    hash_function = 0; // Deal with pre-release version of this table...
                bucket_count        = data.GetU32 (&offset);
                hashes_count        = data.GetU32 (&offset);
                header_data_len     = data.GetU32 (&offset);
                return offset;
            }
            return UINT32_MAX;
        }

        // Write the fixed header fields to "ostrm" in the stream's byte
        // order. Subclasses that carry header data must override this and
        // also emit the "header_data_len" bytes of header data so the
        // layout matches what Read()/GetByteSize() expect.
        // FIX: ExportTable::Save() called header.Write(), which previously
        // did not exist (only a commented out prototype did).
        virtual void
        Write (lldb_private::Stream &ostrm)
        {
            ostrm.PutHex32 (magic);
            ostrm.PutHex16 (version);
            ostrm.PutHex16 (hash_function);
            ostrm.PutHex32 (bucket_count);
            ostrm.PutHex32 (hashes_count);
            ostrm.PutHex32 (header_data_len);
        }
    };

    // Helper for building and serializing a MappedHash table. Subclasses
    // supply key extraction and hash data serialization through the pure
    // virtual methods below, add entries with AddEntry(), then call Save().
    template <typename __KeyType, class __HeaderDataType, class __ValueType>
    class ExportTable
    {
    public:
        typedef __HeaderDataType HeaderDataType;
        typedef Header<HeaderDataType> HeaderType;
        typedef __KeyType KeyType;
        typedef __ValueType ValueType;

        struct Entry
        {
            uint32_t    hash;   // Hash of the original string
            KeyType     key;    // Key derived from the string (subclass defined)
            ValueType   value;  // Value associated with the string
        };

        typedef std::vector<ValueType> ValueArrayType;

        typedef std::map<KeyType, ValueArrayType> HashData;
        // Map a name hash to one or more name infos
        typedef std::map<uint32_t, HashData> HashToHashData;

        // FIX: a virtual interface without a virtual destructor is
        // undefined behavior when deleting through a base pointer.
        virtual
        ~ExportTable ()
        {
        }

        // Convert a C string into the key type stored with each entry.
        virtual KeyType
        GetKeyForStringType (const char *cstr) const = 0;

        // Return the exact number of bytes WriteHashData() will emit for
        // "key_to_key_values"; Save() uses this to compute hash data
        // offsets.
        virtual size_t
        GetByteSize (const HashData &key_to_key_values) = 0;

        // Serialize "hash_data" to "ostrm". Must emit exactly
        // GetByteSize(hash_data) bytes for the offsets written by Save()
        // to be correct.
        virtual bool
        WriteHashData (const HashData &hash_data,
                       lldb_private::Stream &ostrm) = 0;

        // Hash "cstr" and append an entry mapping it to "value".
        void
        AddEntry (const char *cstr, const ValueType &value)
        {
            Entry entry;
            entry.hash = MappedHash::HashString (eHashFunctionDJB, cstr);
            entry.key = GetKeyForStringType (cstr);
            entry.value = value;
            m_entries.push_back (entry);
        }

        // Serialize all accumulated entries to "ostrm" as: header, bucket
        // array (index of each bucket's first hash, or UINT32_MAX when
        // empty), hash value array, hash data offset array, then the hash
        // data itself. Does nothing when no entries have been added.
        void
        Save (const HeaderDataType &header_data,
              lldb_private::Stream &ostrm)
        {
            if (m_entries.empty())
                return;

            const uint32_t num_entries = m_entries.size();
            uint32_t i = 0;

            HeaderType header;

            header.magic = HASH_MAGIC;
            header.version = 1;
            header.hash_function = eHashFunctionDJB;
            header.bucket_count = 0;
            header.hashes_count = 0;
            // FIX: this used to assign to "header.prologue_length", a
            // member that does not exist on Header.
            header.SetHeaderDataByteSize (header_data.GetByteSize());

            // We need to figure out the number of unique hashes first
            // before we can calculate the number of buckets we want to use.
            typedef std::vector<uint32_t> hash_coll;
            hash_coll unique_hashes;
            unique_hashes.resize (num_entries);
            for (i=0; i<num_entries; ++i)
                unique_hashes[i] = m_entries[i].hash;
            std::sort (unique_hashes.begin(), unique_hashes.end());
            hash_coll::iterator pos = std::unique (unique_hashes.begin(), unique_hashes.end());
            const size_t num_unique_hashes = std::distance (unique_hashes.begin(), pos);

            // Heuristic load factor: fewer buckets than unique hashes for
            // large tables, one bucket per hash for small ones.
            if (num_unique_hashes > 1024)
                header.bucket_count = num_unique_hashes/4;
            else if (num_unique_hashes > 16)
                header.bucket_count = num_unique_hashes/2;
            else
                header.bucket_count = num_unique_hashes;
            if (header.bucket_count == 0)
                header.bucket_count = 1;

            std::vector<HashToHashData> hash_buckets;
            std::vector<uint32_t> hash_indexes (header.bucket_count, 0);
            std::vector<uint32_t> hash_values;
            std::vector<uint32_t> hash_offsets;
            hash_buckets.resize (header.bucket_count);
            uint32_t bucket_entry_empties = 0;

            // Push all of the hashes into their buckets and create all
            // bucket entries all populated with data.
            for (i=0; i<num_entries; ++i)
            {
                const uint32_t hash = m_entries[i].hash;
                const uint32_t bucket_idx = hash % header.bucket_count;
                // FIX: this used to read non-existent Entry members
                // "str_offset"/"die_offset" (DWARF specific leftovers that
                // could never compile when instantiated); use the generic
                // key/value members instead.
                hash_buckets[bucket_idx][hash][m_entries[i].key].push_back (m_entries[i].value);
            }

            // For each bucket record the index of its first hash in the
            // hash value array (UINT32_MAX marks an empty bucket), and
            // build the parallel hash value / hash data offset arrays.
            // FIX: hash_offsets used to receive GetByteSize(...) -- a size,
            // not an offset. Keep a running offset instead; offsets are
            // relative to the start of the hash data area that follows the
            // offsets array (readers define the final base -- confirm
            // against the matching MemoryTable subclass).
            uint32_t hash_data_offset = 0;
            for (i=0; i<header.bucket_count; ++i)
            {
                HashToHashData &bucket_entry = hash_buckets[i];

                if (bucket_entry.empty())
                {
                    // Empty bucket
                    ++bucket_entry_empties;
                    hash_indexes[i] = UINT32_MAX;
                }
                else
                {
                    hash_indexes[i] = hash_values.size();
                    typename HashToHashData::const_iterator pos, end = bucket_entry.end();
                    for (pos = bucket_entry.begin(); pos != end; ++pos)
                    {
                        hash_values.push_back (pos->first);
                        hash_offsets.push_back (hash_data_offset);
                        hash_data_offset += GetByteSize (pos->second);
                    }
                }
            }
            header.hashes_count = hash_values.size();

            // Write the header out now that we have the hash count.
            header.Write (ostrm);

            // Now for each bucket we write the start index of the hashes
            // for the current bucket, or UINT32_MAX if the bucket is empty.
            for (i=0; i<header.bucket_count; ++i)
                ostrm.PutHex32 (hash_indexes[i]);

            // Now we need to write out all of the hash values.
            for (i=0; i<header.hashes_count; ++i)
                ostrm.PutHex32 (hash_values[i]);

            // Now we need to write out all of the hash data offsets; there
            // is an offset for each hash in the hashes array that was
            // written out above.
            for (i=0; i<header.hashes_count; ++i)
                ostrm.PutHex32 (hash_offsets[i]);

            // Now we write the data for each hash, in the same order the
            // offsets were computed above.
            for (i=0; i<header.bucket_count; ++i)
            {
                HashToHashData &bucket_entry = hash_buckets[i];

                typename HashToHashData::const_iterator pos, end = bucket_entry.end();
                for (pos = bucket_entry.begin(); pos != end; ++pos)
                {
                    // FIX: WriteHashData() was being called with a single
                    // argument although it is declared to also take the
                    // destination stream. The old "if (!bucket_entry.empty())"
                    // guard here was redundant: this loop body cannot run
                    // for an empty bucket.
                    WriteHashData (pos->second, ostrm);
                }
            }
        }
    protected:
        typedef std::vector<Entry> collection;
        collection m_entries;   // All entries added via AddEntry(), in insertion order
    };

    // A class for reading and using a saved hash table from a block of data
    // in memory.
    template <typename __KeyType, class __HeaderType, class __HashData>
    class MemoryTable
    {
    public:
        typedef __HeaderType HeaderType;
        typedef __KeyType KeyType;
        typedef __HashData HashData;

        // Result of one GetHashDataForName() step while walking the chain
        // of HashData objects that share a single hash value.
        enum Result
        {
            eResultKeyMatch         = 0u, // The entry was found, key matched and "pair" was filled in successfully
            eResultKeyMismatch      = 1u, // Bucket hash data collision, but key didn't match
            eResultEndOfHashData    = 2u, // The chain of items for this hash data in this bucket is terminated, search no more
            eResultError            = 3u  // Error parsing the hash data, abort
        };

        struct Pair
        {
            KeyType key;
            HashData value;
        };

        // Parse the header from "data" and, when it is valid, cache
        // non-owning pointers into "data" for the bucket index, hash value
        // and hash offset arrays. On a bad header the pointers stay NULL so
        // IsValid()/Find() fail safely. "data" must outlive this table.
        MemoryTable (lldb_private::DataExtractor &data) :
            m_header (),
            m_hash_indexes (NULL),
            m_hash_values (NULL),
            m_hash_offsets (NULL)
        {
            uint32_t offset = m_header.Read (data, 0);
            if (offset != UINT32_MAX && IsValid ())
            {
                m_hash_indexes = (uint32_t *)data.GetData (&offset, m_header.bucket_count * sizeof(uint32_t));
                m_hash_values  = (uint32_t *)data.GetData (&offset, m_header.hashes_count * sizeof(uint32_t));
                m_hash_offsets = (uint32_t *)data.GetData (&offset, m_header.hashes_count * sizeof(uint32_t));
            }
        }

        virtual
        ~MemoryTable ()
        {
        }

        // True when the header was parsed successfully and describes a
        // non-empty table with a supported version and hash function.
        bool
        IsValid () const
        {
            return m_header.version == 1 &&
                   m_header.hash_function == eHashFunctionDJB &&
                   m_header.bucket_count > 0 &&
                   m_header.hashes_count > 0;
        }

        // Index into the hash value array of the first hash that belongs
        // to bucket "bucket_idx", or UINT32_MAX for an empty bucket or an
        // out of range index.
        uint32_t
        GetHashIndex (uint32_t bucket_idx) const
        {
            if (m_hash_indexes && bucket_idx < m_header.bucket_count)
                return m_hash_indexes[bucket_idx];
            return UINT32_MAX;
        }

        // The 32 bit hash value stored at "hash_idx", or UINT32_MAX when
        // out of range.
        uint32_t
        GetHashValue (uint32_t hash_idx) const
        {
            if (m_hash_values && hash_idx < m_header.hashes_count)
                return m_hash_values[hash_idx];
            return UINT32_MAX;
        }

        // The hash data offset parallel to the hash value at "hash_idx",
        // or UINT32_MAX when out of range.
        uint32_t
        GetHashDataOffset (uint32_t hash_idx) const
        {
            if (m_hash_offsets && hash_idx < m_header.hashes_count)
                return m_hash_offsets[hash_idx];
            return UINT32_MAX;
        }

        // Look up "name" in the table. Returns true and fills in "pair"
        // when a HashData entry whose key resolves to "name" is found.
        // Walks all hashes that fall into name's bucket; for a matching
        // hash value the subclass's GetHashDataForName() is called
        // repeatedly to walk the HashData chain at that offset.
        bool
        Find (const char *name, Pair &pair) const
        {
            if (IsValid ())
            {
                const uint32_t bucket_count = m_header.bucket_count;
                const uint32_t hash_count = m_header.hashes_count;
                const uint32_t hash_value = MappedHash::HashString (m_header.hash_function, name);
                const uint32_t bucket_idx = hash_value % bucket_count;
                uint32_t hash_idx = GetHashIndex (bucket_idx);
                if (hash_idx < hash_count)
                {
                    for (; hash_idx < hash_count; ++hash_idx)
                    {
                        const uint32_t curr_hash_value = GetHashValue (hash_idx);
                        if (curr_hash_value == hash_value)
                        {
                            uint32_t hash_data_offset = GetHashDataOffset (hash_idx);
                            while (hash_data_offset != UINT32_MAX)
                            {
                                const uint32_t prev_hash_data_offset = hash_data_offset;
                                Result hash_result = GetHashDataForName (name, &hash_data_offset, pair);
                                // Check the result of getting our hash data
                                switch (hash_result)
                                {
                                case eResultKeyMatch:
                                    return true;

                                case eResultKeyMismatch:
                                    // If the subclass failed to advance the
                                    // offset we would loop forever -- bail.
                                    if (prev_hash_data_offset == hash_data_offset)
                                        return false;
                                    break;

                                case eResultEndOfHashData:
                                    // The last HashData for this key has been reached, stop searching
                                    return false;
                                case eResultError:
                                    // Error parsing the hash data, abort
                                    return false;
                                }
                            }
                        }
                        // Hashes for a bucket are stored contiguously; once
                        // we see a hash belonging to another bucket we are
                        // past the end of our bucket's run.
                        if ((curr_hash_value % bucket_count) != bucket_idx)
                            break;
                    }
                }
            }
            return false;
        }

        // This method must be implemented in any subclasses.
        // The KeyType is user specified and must somehow result in a string
        // value. For example, the KeyType might be a string offset in a string
        // table and subclasses can store their string table as a member of the
        // subclass and return a valid "const char *" given a "key". The value
        // could also be a C string pointer, in which case just returning "key"
        // will suffice.
        virtual const char *
        GetStringForKeyType (KeyType key) const = 0;

        // This method must be implemented in any subclasses and it must try to
        // read one "Pair" at the offset pointed to by the "hash_data_offset_ptr"
        // parameter. This offset should be updated as bytes are consumed and
        // a value "Result" enum should be returned. If the "name" matches the
        // full name for the "pair.key" (which must be filled in by this call),
        // then the HashData in the pair ("pair.value") should be extracted and
        // filled in and "eResultKeyMatch" should be returned. If "name" doesn't
        // match this string for the key, then "eResultKeyMismatch" should be
        // returned and all data for the current HashData must be consumed or
        // skipped and the "hash_data_offset_ptr" offset needs to be updated to
        // point to the next HashData. If the end of the HashData objects for
        // a given hash value have been reached, then "eResultEndOfHashData"
        // should be returned. If anything else goes wrong during parsing,
        // return "eResultError" and the corresponding "Find()" function will
        // be canceled and return false.
        virtual Result
        GetHashDataForName (const char *name,
                            uint32_t* hash_data_offset_ptr,
                            Pair &pair) const = 0;

        const HeaderType &
        GetHeader()
        {
            return m_header;
        }

    protected:
        // Implementation agnostic information
        HeaderType m_header;        // Parsed table header
        uint32_t *m_hash_indexes;   // bucket_count entries; index of each bucket's first hash (not owned)
        uint32_t *m_hash_values;    // hashes_count 32 bit hash values (not owned)
        uint32_t *m_hash_offsets;   // hashes_count hash data offsets, parallel to m_hash_values (not owned)
    };

};

#endif // #ifndef liblldb_MappedHash_h_