#include "search.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <limits.h>
#include "options.h"
#include "hash-table.h"
#include "config.h"
#define for if (0) ; else for
#if HAVE_DYNAMIC_ARRAY
#define DYNAMIC_ARRAY(var,eltype,size) eltype var[size]
#define FREE_DYNAMIC_ARRAY(var)
#else
#define DYNAMIC_ARRAY(var,eltype,size) eltype *var = new eltype[size]
#define FREE_DYNAMIC_ARRAY(var) delete[] var
#endif
Search::Search (KeywordExt_List *list)
: _head (list)
{
}
void
Search::prepare ()
{
_total_keys = 0;
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
_total_keys++;
_max_key_len = INT_MIN;
_min_key_len = INT_MAX;
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
if (_max_key_len < keyword->_allchars_length)
_max_key_len = keyword->_allchars_length;
if (_min_key_len > keyword->_allchars_length)
_min_key_len = keyword->_allchars_length;
}
if (_min_key_len == 0)
{
fprintf (stderr, "Empty input keyword is not allowed.\n"
"To recognize an empty input keyword, your code should check for\n"
"len == 0 before calling the gperf generated lookup function.\n");
exit (1);
}
if (option[SEVENBIT])
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
const char *k = keyword->_allchars;
for (int i = keyword->_allchars_length; i > 0; k++, i--)
if (!(static_cast<unsigned char>(*k) < 128))
{
fprintf (stderr, "Option --seven-bit has been specified,\n"
"but keyword \"%.*s\" contains non-ASCII characters.\n"
"Try removing option --seven-bit.\n",
keyword->_allchars_length, keyword->_allchars);
exit (1);
}
}
}
unsigned int
Search::compute_alpha_size () const
{
return (option[SEVENBIT] ? 128 : 256);
}
unsigned int *
Search::compute_alpha_unify () const
{
if (option[UPPERLOWER])
{
unsigned int alpha_size = compute_alpha_size();
unsigned int *alpha_unify = new unsigned int[alpha_size];
for (unsigned int c = 0; c < alpha_size; c++)
alpha_unify[c] = c;
for (unsigned int c = 'A'; c <= 'Z'; c++)
alpha_unify[c] = c + ('a'-'A');
return alpha_unify;
}
else
return NULL;
}
void
Search::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const
{
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
temp->first()->init_selchars_tuple(positions, alpha_unify);
}
void
Search::delete_selchars () const
{
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
temp->first()->delete_selchars();
}
unsigned int
Search::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const
{
init_selchars_tuple (positions, alpha_unify);
unsigned int count = 0;
{
Hash_Table representatives (_total_keys, option[NOLENGTH]);
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
if (representatives.insert (keyword))
count++;
}
}
delete_selchars ();
return count;
}
void
Search::find_positions ()
{
if (option[POSITIONS])
{
_key_positions = option.get_key_positions();
return;
}
unsigned int *alpha_unify = compute_alpha_unify ();
Positions mandatory;
if (!option[DUP])
{
for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest())
{
KeywordExt *keyword1 = l1->first();
for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest())
{
KeywordExt *keyword2 = l2->first();
if (keyword1->_allchars_length == keyword2->_allchars_length)
{
int n = keyword1->_allchars_length;
int i;
for (i = 0; i < n - 1; i++)
{
unsigned char c1 = keyword1->_allchars[i];
unsigned char c2 = keyword2->_allchars[i];
if (option[UPPERLOWER])
{
if (c1 >= 'A' && c1 <= 'Z')
c1 += 'a' - 'A';
if (c2 >= 'A' && c2 <= 'Z')
c2 += 'a' - 'A';
}
if (c1 != c2)
break;
}
if (i < n - 1)
{
int j;
for (j = i + 1; j < n; j++)
{
unsigned char c1 = keyword1->_allchars[j];
unsigned char c2 = keyword2->_allchars[j];
if (option[UPPERLOWER])
{
if (c1 >= 'A' && c1 <= 'Z')
c1 += 'a' - 'A';
if (c2 >= 'A' && c2 <= 'Z')
c2 += 'a' - 'A';
}
if (c1 != c2)
break;
}
if (j >= n)
{
if (!mandatory.contains (i))
mandatory.add (i);
}
}
}
}
}
}
int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1
? _max_key_len - 1 : Positions::MAX_KEY_POS - 1);
Positions current = mandatory;
unsigned int current_duplicates_count =
count_duplicates_tuple (current, alpha_unify);
for (;;)
{
Positions best;
unsigned int best_duplicates_count = UINT_MAX;
for (int i = imax; i >= -1; i--)
if (!current.contains (i))
{
Positions tryal = current;
tryal.add (i);
unsigned int try_duplicates_count =
count_duplicates_tuple (tryal, alpha_unify);
if (try_duplicates_count < best_duplicates_count
|| (try_duplicates_count == best_duplicates_count && i >= 0))
{
best = tryal;
best_duplicates_count = try_duplicates_count;
}
}
if (best_duplicates_count >= current_duplicates_count)
break;
current = best;
current_duplicates_count = best_duplicates_count;
}
for (;;)
{
Positions best;
unsigned int best_duplicates_count = UINT_MAX;
for (int i = imax; i >= -1; i--)
if (current.contains (i) && !mandatory.contains (i))
{
Positions tryal = current;
tryal.remove (i);
unsigned int try_duplicates_count =
count_duplicates_tuple (tryal, alpha_unify);
if (try_duplicates_count < best_duplicates_count
|| (try_duplicates_count == best_duplicates_count && i == -1))
{
best = tryal;
best_duplicates_count = try_duplicates_count;
}
}
if (best_duplicates_count > current_duplicates_count)
break;
current = best;
current_duplicates_count = best_duplicates_count;
}
for (;;)
{
Positions best;
unsigned int best_duplicates_count = UINT_MAX;
for (int i1 = imax; i1 >= -1; i1--)
if (current.contains (i1) && !mandatory.contains (i1))
for (int i2 = imax; i2 >= -1; i2--)
if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1)
for (int i3 = imax; i3 >= 0; i3--)
if (!current.contains (i3))
{
Positions tryal = current;
tryal.remove (i1);
tryal.remove (i2);
tryal.add (i3);
unsigned int try_duplicates_count =
count_duplicates_tuple (tryal, alpha_unify);
if (try_duplicates_count < best_duplicates_count
|| (try_duplicates_count == best_duplicates_count
&& (i1 == -1 || i2 == -1 || i3 >= 0)))
{
best = tryal;
best_duplicates_count = try_duplicates_count;
}
}
if (best_duplicates_count > current_duplicates_count)
break;
current = best;
current_duplicates_count = best_duplicates_count;
}
_key_positions = current;
if (option[DEBUG])
{
fprintf (stderr, "\nComputed positions: ");
PositionReverseIterator iter = _key_positions.reviterator();
bool seen_lastchar = false;
bool first = true;
for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; )
{
if (!first)
fprintf (stderr, ", ");
if (i == Positions::LASTCHAR)
seen_lastchar = true;
else
{
fprintf (stderr, "%d", i + 1);
first = false;
}
}
if (seen_lastchar)
{
if (!first)
fprintf (stderr, ", ");
fprintf (stderr, "$");
}
fprintf (stderr, "\n");
}
delete[] alpha_unify;
}
unsigned int
Search::count_duplicates_tuple () const
{
unsigned int *alpha_unify = compute_alpha_unify ();
unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify);
delete[] alpha_unify;
return count;
}
unsigned int
Search::compute_alpha_size (const unsigned int *alpha_inc) const
{
unsigned int max_alpha_inc = 0;
for (int i = 0; i < _max_key_len; i++)
if (max_alpha_inc < alpha_inc[i])
max_alpha_inc = alpha_inc[i];
return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
}
unsigned int *
Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
{
if (option[UPPERLOWER])
{
unsigned int alpha_size = compute_alpha_size (alpha_inc);
unsigned int *alpha_unify = new unsigned int[alpha_size];
for (unsigned int c = 0; c < alpha_size; c++)
alpha_unify[c] = c;
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
PositionIterator iter = positions.iterator(keyword->_allchars_length);
for (int i; (i = iter.next ()) != PositionIterator::EOS; )
{
unsigned int c;
if (i == Positions::LASTCHAR)
c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
else if (i < keyword->_allchars_length)
c = static_cast<unsigned char>(keyword->_allchars[i]);
else
abort ();
if (c >= 'A' && c <= 'Z')
c += 'a' - 'A';
if (c >= 'a' && c <= 'z')
{
if (i != Positions::LASTCHAR)
c += alpha_inc[i];
unsigned int d = alpha_unify[c];
unsigned int b = c - ('a'-'A');
for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
alpha_unify[a] = d;
}
}
}
return alpha_unify;
}
else
return NULL;
}
void
Search::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const
{
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc);
}
unsigned int
Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
{
unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc);
init_selchars_multiset (_key_positions, alpha_unify, alpha_inc);
unsigned int count = 0;
{
Hash_Table representatives (_total_keys, option[NOLENGTH]);
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
if (representatives.insert (keyword))
count++;
}
}
delete_selchars ();
delete[] alpha_unify;
return count;
}
void
Search::find_alpha_inc ()
{
unsigned int duplicates_goal = count_duplicates_tuple ();
unsigned int *current = new unsigned int [_max_key_len];
for (int i = 0; i < _max_key_len; i++)
current[i] = 0;
unsigned int current_duplicates_count = count_duplicates_multiset (current);
if (current_duplicates_count > duplicates_goal)
{
unsigned int nindices;
{
nindices = 0;
PositionIterator iter = _key_positions.iterator(_max_key_len);
for (;;)
{
int key_pos = iter.next ();
if (key_pos == PositionIterator::EOS)
break;
if (key_pos != Positions::LASTCHAR)
nindices++;
}
}
DYNAMIC_ARRAY (indices, unsigned int, nindices);
{
unsigned int j = 0;
PositionIterator iter = _key_positions.iterator(_max_key_len);
for (;;)
{
int key_pos = iter.next ();
if (key_pos == PositionIterator::EOS)
break;
if (key_pos != Positions::LASTCHAR)
indices[j++] = key_pos;
}
if (!(j == nindices))
abort ();
}
DYNAMIC_ARRAY (best, unsigned int, _max_key_len);
DYNAMIC_ARRAY (tryal, unsigned int, _max_key_len);
do
{
for (unsigned int inc = 1; ; inc++)
{
unsigned int best_duplicates_count = UINT_MAX;
for (unsigned int j = 0; j < nindices; j++)
{
memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
tryal[indices[j]] += inc;
unsigned int try_duplicates_count =
count_duplicates_multiset (tryal);
if (try_duplicates_count < best_duplicates_count)
{
memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
best_duplicates_count = try_duplicates_count;
}
}
if (best_duplicates_count < current_duplicates_count)
{
memcpy (current, best, _max_key_len * sizeof (unsigned int));
current_duplicates_count = best_duplicates_count;
break;
}
}
}
while (current_duplicates_count > duplicates_goal);
FREE_DYNAMIC_ARRAY (tryal);
FREE_DYNAMIC_ARRAY (best);
if (option[DEBUG])
{
fprintf (stderr, "\nComputed alpha increments: ");
bool first = true;
for (unsigned int j = nindices; j-- > 0; )
if (current[indices[j]] != 0)
{
if (!first)
fprintf (stderr, ", ");
fprintf (stderr, "%u:+%u",
indices[j] + 1, current[indices[j]]);
first = false;
}
fprintf (stderr, "\n");
}
FREE_DYNAMIC_ARRAY (indices);
}
_alpha_inc = current;
_alpha_size = compute_alpha_size (_alpha_inc);
_alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
}
void
Search::prepare_asso_values ()
{
KeywordExt_List *temp;
init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc);
_max_selchars_length = _key_positions.iterator(_max_key_len).remaining();
{
_list_len = _total_keys;
_total_duplicates = 0;
Hash_Table representatives (_list_len, option[NOLENGTH]);
KeywordExt_List *prev = NULL;
for (temp = _head; temp; )
{
KeywordExt *keyword = temp->first();
KeywordExt *other_keyword = representatives.insert (keyword);
KeywordExt_List *garbage = NULL;
if (other_keyword)
{
_total_duplicates++;
_list_len--;
prev->rest() = temp->rest();
garbage = temp;
keyword->_duplicate_link = other_keyword->_duplicate_link;
other_keyword->_duplicate_link = keyword;
if (!option[DUP] || option[DEBUG])
{
fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"",
keyword->_allchars_length, keyword->_allchars,
other_keyword->_allchars_length, other_keyword->_allchars);
for (int j = 0; j < keyword->_selchars_length; j++)
putc (keyword->_selchars[j], stderr);
fprintf (stderr, "\".\n");
}
}
else
{
keyword->_duplicate_link = NULL;
prev = temp;
}
temp = temp->rest();
if (garbage)
delete garbage;
}
if (option[DEBUG])
representatives.dump();
}
if (_total_duplicates)
{
if (option[DUP])
fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n",
_total_duplicates);
else
{
fprintf (stderr, "%d input keys have identical hash values,\n",
_total_duplicates);
if (option[POSITIONS])
fprintf (stderr, "try different key positions or use option -D.\n");
else
fprintf (stderr, "use option -D.\n");
exit (1);
}
}
_occurrences = new int[_alpha_size];
memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
for (temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
const unsigned int *ptr = keyword->_selchars;
for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
_occurrences[*ptr]++;
}
_asso_values = new int[_alpha_size];
int non_linked_length = _list_len;
unsigned int asso_value_max;
asso_value_max =
static_cast<unsigned int>(non_linked_length * option.get_size_multiple());
if (asso_value_max == 0)
asso_value_max = 1;
asso_value_max |= asso_value_max >> 1;
asso_value_max |= asso_value_max >> 2;
asso_value_max |= asso_value_max >> 4;
asso_value_max |= asso_value_max >> 8;
asso_value_max |= asso_value_max >> 16;
asso_value_max++;
_asso_value_max = asso_value_max;
_max_hash_value = (option[NOLENGTH] ? 0 : _max_key_len)
+ (_asso_value_max - 1) * _max_selchars_length;
_collision_detector = new Bool_Array (_max_hash_value + 1);
if (option[DEBUG])
{
fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
"\nmaximum size of generated hash table is %d\n",
non_linked_length, asso_value_max, _max_hash_value);
int field_width;
field_width = 0;
{
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
if (field_width < keyword->_selchars_length)
field_width = keyword->_selchars_length;
}
}
fprintf (stderr, "\ndumping the keyword list without duplicates\n");
fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
int i = 0;
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
fprintf (stderr, "%9d, ", ++i);
if (field_width > keyword->_selchars_length)
fprintf (stderr, "%*s", field_width - keyword->_selchars_length, "");
for (int j = 0; j < keyword->_selchars_length; j++)
putc (keyword->_selchars[j], stderr);
fprintf (stderr, ", %.*s\n",
keyword->_allchars_length, keyword->_allchars);
}
fprintf (stderr, "\nend of keyword list\n\n");
}
if (option[RANDOM] || option.get_jump () == 0)
srand (static_cast<long>(time (0)));
_initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ());
_jump = option.get_jump ();
}
struct EquivalenceClass
{
KeywordExt_List * _keywords;
KeywordExt_List * _keywords_last;
unsigned int _cardinality;
unsigned int * _undetermined_chars;
unsigned int _undetermined_chars_length;
EquivalenceClass * _next;
};
struct Step
{
unsigned int _changing_count;
unsigned int * _changing;
unsigned int _asso_value_max;
bool * _undetermined;
EquivalenceClass * _partition;
double _expected_lower;
double _expected_upper;
Step * _next;
};
static inline bool
equals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len)
{
while (len > 0)
{
if (*ptr1 != *ptr2)
return false;
ptr1++;
ptr2++;
len--;
}
return true;
}
EquivalenceClass *
Search::compute_partition (bool *undetermined) const
{
EquivalenceClass *partition = NULL;
EquivalenceClass *partition_last = NULL;
for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
unsigned int *undetermined_chars =
new unsigned int[keyword->_selchars_length];
unsigned int undetermined_chars_length = 0;
for (int i = 0; i < keyword->_selchars_length; i++)
if (undetermined[keyword->_selchars[i]])
undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i];
EquivalenceClass *equclass;
for (equclass = partition; equclass; equclass = equclass->_next)
if (equclass->_undetermined_chars_length == undetermined_chars_length
&& equals (equclass->_undetermined_chars, undetermined_chars,
undetermined_chars_length))
break;
if (equclass == NULL)
{
equclass = new EquivalenceClass();
equclass->_keywords = NULL;
equclass->_keywords_last = NULL;
equclass->_cardinality = 0;
equclass->_undetermined_chars = undetermined_chars;
equclass->_undetermined_chars_length = undetermined_chars_length;
equclass->_next = NULL;
if (partition)
partition_last->_next = equclass;
else
partition = equclass;
partition_last = equclass;
}
else
delete[] undetermined_chars;
KeywordExt_List *cons = new KeywordExt_List(keyword);
if (equclass->_keywords)
equclass->_keywords_last->rest() = cons;
else
equclass->_keywords = cons;
equclass->_keywords_last = cons;
equclass->_cardinality++;
}
for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
delete[] cls->_undetermined_chars;
return partition;
}
static void
delete_partition (EquivalenceClass *partition)
{
while (partition != NULL)
{
EquivalenceClass *equclass = partition;
partition = equclass->_next;
delete_list (equclass->_keywords);
delete equclass;
}
}
unsigned int
Search::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const
{
unsigned int sum = 0;
unsigned int m = _max_selchars_length;
DYNAMIC_ARRAY (split_cardinalities, unsigned int, m + 1);
for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
{
for (unsigned int i = 0; i <= m; i++)
split_cardinalities[i] = 0;
for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
unsigned int count = 0;
for (int i = 0; i < keyword->_selchars_length; i++)
if (keyword->_selchars[i] == c)
count++;
split_cardinalities[count]++;
}
sum += cls->_cardinality * cls->_cardinality;
for (unsigned int i = 0; i <= m; i++)
sum -= split_cardinalities[i] * split_cardinalities[i];
}
FREE_DYNAMIC_ARRAY (split_cardinalities);
return sum;
}
bool
Search::unchanged_partition (EquivalenceClass *partition, unsigned int c) const
{
for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
{
unsigned int first_count = UINT_MAX;
for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
unsigned int count = 0;
for (int i = 0; i < keyword->_selchars_length; i++)
if (keyword->_selchars[i] == c)
count++;
if (temp == cls->_keywords)
first_count = count;
else if (count != first_count)
return false;
}
}
return true;
}
void
Search::find_asso_values ()
{
Step *steps;
{
bool *undetermined;
bool *determined;
steps = NULL;
undetermined = new bool[_alpha_size];
for (unsigned int c = 0; c < _alpha_size; c++)
undetermined[c] = false;
determined = new bool[_alpha_size];
for (unsigned int c = 0; c < _alpha_size; c++)
determined[c] = true;
for (;;)
{
EquivalenceClass *partition = compute_partition (undetermined);
unsigned int chosen_c;
unsigned int chosen_possible_collisions;
{
unsigned int best_c = 0;
unsigned int best_possible_collisions = UINT_MAX;
for (unsigned int c = 0; c < _alpha_size; c++)
if (_occurrences[c] > 0 && determined[c])
{
unsigned int possible_collisions =
count_possible_collisions (partition, c);
if (possible_collisions < best_possible_collisions)
{
best_c = c;
best_possible_collisions = possible_collisions;
}
}
if (best_possible_collisions == UINT_MAX)
{
delete_partition (partition);
break;
}
chosen_c = best_c;
chosen_possible_collisions = best_possible_collisions;
}
Step *step = new Step();
step->_undetermined = new bool[_alpha_size];
memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool));
step->_partition = partition;
undetermined[chosen_c] = true;
partition = compute_partition (undetermined);
for (unsigned int c = 0; c < _alpha_size; c++)
if (_occurrences[c] > 0 && determined[c]
&& unchanged_partition (partition, c))
{
undetermined[c] = true;
determined[c] = false;
}
if (determined[chosen_c])
abort ();
unsigned int changing_count;
changing_count = 0;
for (unsigned int c = 0; c < _alpha_size; c++)
if (undetermined[c] && !step->_undetermined[c])
changing_count++;
unsigned int *changing = new unsigned int[changing_count];
changing_count = 0;
for (unsigned int c = 0; c < _alpha_size; c++)
if (undetermined[c] && !step->_undetermined[c])
changing[changing_count++] = c;
step->_changing = changing;
step->_changing_count = changing_count;
step->_asso_value_max = _asso_value_max;
step->_expected_lower =
exp (static_cast<double>(chosen_possible_collisions)
/ static_cast<double>(_max_hash_value));
step->_expected_upper =
exp (static_cast<double>(chosen_possible_collisions)
/ static_cast<double>(_asso_value_max));
delete_partition (partition);
step->_next = steps;
steps = step;
}
delete[] determined;
delete[] undetermined;
}
if (option[DEBUG])
{
unsigned int stepno = 0;
for (Step *step = steps; step; step = step->_next)
{
stepno++;
fprintf (stderr, "Step %u chooses _asso_values[", stepno);
for (unsigned int i = 0; i < step->_changing_count; i++)
{
if (i > 0)
fprintf (stderr, ",");
fprintf (stderr, "'%c'", step->_changing[i]);
}
fprintf (stderr, "], expected number of iterations between %g and %g.\n",
step->_expected_lower, step->_expected_upper);
fprintf (stderr, "Keyword equivalence classes:\n");
for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
{
fprintf (stderr, "\n");
for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
{
KeywordExt *keyword = temp->first();
fprintf (stderr, " %.*s\n",
keyword->_allchars_length, keyword->_allchars);
}
}
fprintf (stderr, "\n");
}
}
for (unsigned int c = 0; c < _alpha_size; c++)
_asso_values[c] = 0;
unsigned int stepno = 0;
for (Step *step = steps; step; step = step->_next)
{
stepno++;
unsigned int k = step->_changing_count;
for (unsigned int i = 0; i < k; i++)
{
unsigned int c = step->_changing[i];
_asso_values[c] =
(_initial_asso_value < 0 ? rand () : _initial_asso_value)
& (step->_asso_value_max - 1);
}
unsigned int iterations = 0;
DYNAMIC_ARRAY (iter, unsigned int, k);
for (unsigned int i = 0; i < k; i++)
iter[i] = 0;
unsigned int ii = (_jump != 0 ? k - 1 : 0);
for (;;)
{
bool has_collision = false;
for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
{
_collision_detector->clear ();
for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest())
{
KeywordExt *keyword = ptr->first();
int hashcode;
{
int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
const unsigned int *p = keyword->_selchars;
int i = keyword->_selchars_length;
for (; i > 0; p++, i--)
if (!step->_undetermined[*p])
sum += _asso_values[*p];
hashcode = sum;
}
if (_collision_detector->set_bit (hashcode))
{
has_collision = true;
break;
}
}
if (has_collision)
break;
}
iterations++;
if (!has_collision)
break;
if (_jump != 0)
{
unsigned int bound = iter[ii];
unsigned int i = 0;
while (i < ii)
{
unsigned int c = step->_changing[i];
iter[i]++;
_asso_values[c] =
(_asso_values[c] + _jump) & (step->_asso_value_max - 1);
if (iter[i] <= bound)
goto found_next;
_asso_values[c] =
(_asso_values[c] - iter[i] * _jump)
& (step->_asso_value_max - 1);
iter[i] = 0;
i++;
}
i = ii + 1;
while (i < k)
{
unsigned int c = step->_changing[i];
iter[i]++;
_asso_values[c] =
(_asso_values[c] + _jump) & (step->_asso_value_max - 1);
if (iter[i] < bound)
goto found_next;
_asso_values[c] =
(_asso_values[c] - iter[i] * _jump)
& (step->_asso_value_max - 1);
iter[i] = 0;
i++;
}
{
unsigned int c = step->_changing[ii];
_asso_values[c] =
(_asso_values[c] - bound * _jump)
& (step->_asso_value_max - 1);
iter[ii] = 0;
}
ii++;
if (ii == k)
{
ii = 0;
bound++;
if (bound == step->_asso_value_max)
{
step->_asso_value_max = 2 * step->_asso_value_max;
if (step->_asso_value_max > _asso_value_max)
{
_asso_value_max = step->_asso_value_max;
_max_hash_value =
(option[NOLENGTH] ? 0 : _max_key_len)
+ (_asso_value_max - 1) * _max_selchars_length;
delete _collision_detector;
_collision_detector =
new Bool_Array (_max_hash_value + 1);
}
}
}
{
unsigned int c = step->_changing[ii];
iter[ii] = bound;
_asso_values[c] =
(_asso_values[c] + bound * _jump)
& (step->_asso_value_max - 1);
}
found_next: ;
}
else
{
unsigned int c = step->_changing[ii];
_asso_values[c] =
(_asso_values[c] + rand ()) & (step->_asso_value_max - 1);
ii++;
if (ii == k)
ii = 0;
}
}
FREE_DYNAMIC_ARRAY (iter);
if (option[DEBUG])
{
fprintf (stderr, "Step %u chose _asso_values[", stepno);
for (unsigned int i = 0; i < step->_changing_count; i++)
{
if (i > 0)
fprintf (stderr, ",");
fprintf (stderr, "'%c'", step->_changing[i]);
}
fprintf (stderr, "] in %u iterations.\n", iterations);
}
}
while (steps != NULL)
{
Step *step = steps;
steps = step->_next;
delete[] step->_changing;
delete[] step->_undetermined;
delete_partition (step->_partition);
delete step;
}
}
inline int
Search::compute_hash (KeywordExt *keyword) const
{
int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
const unsigned int *p = keyword->_selchars;
int i = keyword->_selchars_length;
for (; i > 0; p++, i--)
sum += _asso_values[*p];
return keyword->_hash_value = sum;
}
void
Search::find_good_asso_values ()
{
prepare_asso_values ();
int asso_iteration;
if ((asso_iteration = option.get_asso_iterations ()) == 0)
find_asso_values ();
else
{
KeywordExt_List *saved_head = _head;
int best_initial_asso_value = 0;
int best_jump = 1;
int *best_asso_values = new int[_alpha_size];
int best_collisions = INT_MAX;
int best_max_hash_value = INT_MAX;
_initial_asso_value = 0; _jump = 1;
for (;;)
{
_head = copy_list (saved_head);
find_asso_values ();
int collisions = 0;
int max_hash_value = INT_MIN;
_collision_detector->clear ();
for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
{
KeywordExt *keyword = ptr->first();
int hashcode = compute_hash (keyword);
if (max_hash_value < hashcode)
max_hash_value = hashcode;
if (_collision_detector->set_bit (hashcode))
collisions++;
}
if (collisions < best_collisions
|| (collisions == best_collisions
&& max_hash_value < best_max_hash_value))
{
memcpy (best_asso_values, _asso_values,
_alpha_size * sizeof (_asso_values[0]));
best_collisions = collisions;
best_max_hash_value = max_hash_value;
}
delete_list (_head);
if (--asso_iteration == 0)
break;
if (_initial_asso_value >= 2)
_initial_asso_value -= 2, _jump += 2;
else
_initial_asso_value += _jump, _jump = 1;
}
_head = saved_head;
_initial_asso_value = best_initial_asso_value;
_jump = best_jump;
memcpy (_asso_values, best_asso_values,
_alpha_size * sizeof (_asso_values[0]));
delete[] best_asso_values;
}
}
static bool
less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2)
{
return keyword1->_hash_value < keyword2->_hash_value;
}
void
Search::sort ()
{
_head = mergesort_list (_head, less_by_hash_value);
}
void
Search::optimize ()
{
prepare ();
find_positions ();
find_alpha_inc ();
find_good_asso_values ();
_collision_detector->clear ();
for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
{
KeywordExt *curr = curr_ptr->first();
unsigned int hashcode = compute_hash (curr);
if (_collision_detector->set_bit (hashcode))
{
fprintf (stderr,
"\nInternal error, unexpected duplicate hash code\n");
if (option[POSITIONS])
fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
else
fprintf (stderr, "try options -m or -r.\n\n");
exit (1);
}
}
sort ();
int max_hash_value;
{
KeywordExt_List *temp;
for (temp = _head; temp->rest(); temp = temp->rest())
;
max_hash_value = temp->first()->_hash_value;
}
for (unsigned int c = 0; c < _alpha_size; c++)
if (_occurrences[c] == 0)
_asso_values[c] = max_hash_value + 1;
if (_alpha_unify)
for (unsigned int c = 0; c < _alpha_size; c++)
if (_alpha_unify[c] != c)
_asso_values[c] = _asso_values[_alpha_unify[c]];
}
Search::~Search ()
{
delete _collision_detector;
if (option[DEBUG])
{
fprintf (stderr, "\ndumping occurrence and associated values tables\n");
for (unsigned int i = 0; i < _alpha_size; i++)
if (_occurrences[i])
fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n",
i, _asso_values[i], i, _occurrences[i]);
fprintf (stderr, "end table dumping\n");
fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d"
"\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n",
_list_len, _total_keys, _total_duplicates, _max_key_len);
int field_width = _max_selchars_length;
fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n",
field_width, "selchars");
for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
{
fprintf (stderr, "%11d,%11d,%6d, ",
ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index);
if (field_width > ptr->first()->_selchars_length)
fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, "");
for (int j = 0; j < ptr->first()->_selchars_length; j++)
putc (ptr->first()->_selchars[j], stderr);
fprintf (stderr, ", %.*s\n",
ptr->first()->_allchars_length, ptr->first()->_allchars);
}
fprintf (stderr, "End dumping list.\n\n");
}
delete[] _asso_values;
delete[] _occurrences;
delete[] _alpha_unify;
delete[] _alpha_inc;
}