#include <glib.h>
#include <stdio.h>
#include <string.h>

/* Hash values seen so far: the key is the hash value packed into a
 * pointer, the value is a non-NULL marker.  Filled in by process_line().
 */
GHashTable *result_hash;
/* Distinct input lines seen so far, keyed by a private copy of the line.
 */
GHashTable *string_hash;
/* Number of distinct lines whose hash value had already been recorded
 * for an earlier line.
 */
gint collisions = 0;
/* Distinct hash values copied out of result_hash by add_value();
 * allocated in main() with room for nvalues entries.
 */
guint *values;
/* Number of distinct hash values recorded in result_hash.
 */
gint nvalues = 0;

/* Table of candidate table sizes and its length, both defined outside
 * this file; main() hands every entry to test_prime().
 * NOTE(review): presumably GLib's internal prime table -- confirm.
 */
extern gint g_primes[];
extern gint g_nprimes;

/* Hash function under test; process_line() applies it to each distinct
 * input line.  Swap the definitions below to test the alternative.
 */
#define HASH_FUNC g_str_hash
/* #define HASH_FUNC procedural_db_hash_func */


/* Alternative string hash, selectable as HASH_FUNC.
 *
 * key: NUL-terminated string to hash.
 *
 * Returns the hash value.  Starting from 0, every character (read as a
 * plain gchar) updates the hash as
 *
 *     hash = hash * 33 + character
 *
 * i.e. (shift-left-5) plus (old) plus the new character.
 *
 * The multiply-and-add scheme follows the string hash in Tcl's
 * tclHash.c.  Note that the Tcl original multiplies by 9
 * (shift-left-3 plus old) whereas this variant multiplies by 33.  The
 * rationale given there was:
 *
 *   I tried a zillion different hash functions and asked many other
 *   people for advice.  Many people had their own favorite functions,
 *   all different, but no-one had much idea why they were good ones.
 *   I chose the one below (multiply by 9 and add new character)
 *   because of the following reasons:
 *
 *   1. Multiplying by 10 is perfect for keys that are decimal strings,
 *      and multiplying by 9 is just about as good.
 *   2. Times-9 is (shift-left-3) plus (old).  This means that each
 *      character's bits hang around in the low-order bits of the
 *      hash value for ever, plus they spread fairly rapidly up to
 *      the high-order bits to fill out the hash value.  This seems
 *      to work well both for decimal and non-decimal strings.
 *
 * tclHash.c --
 *
 *      Implementation of in-memory hash tables for Tcl and Tcl-based
 *      applications.
 *
 * Copyright (c) 1991-1993 The Regents of the University of California.
 * Copyright (c) 1994 Sun Microsystems, Inc.
 */
static guint
procedural_db_hash_func (gconstpointer key)
{
  const gchar *cursor = (const gchar *) key;
  guint hash = 0;
  int ch;

  /* Walk the string up to, but not including, the terminating NUL. */
  for (ch = *cursor; ch != 0; ch = *++cursor)
    hash = (hash << 5) + hash + (guint) ch;

  return hash;
}

/* Callback for g_hash_table_foreach(): append one entry's key, decoded
 * back into an integer, to the global `values' array.
 *
 * key:   hash-table key, an integer packed into a pointer
 * value: the entry's value; not used
 * data:  points to a gint holding the next free index in `values';
 *        the index is advanced by one after the store
 */
void
add_value (gpointer key, gpointer value, gpointer data)
{
  gint *next_slot = data;
  gint slot = *next_slot;

  values[slot] = GPOINTER_TO_INT(key);
  *next_slot = slot + 1;
}

/* Account for one input line in the hash statistics.
 *
 * buffer: NUL-terminated line, typically straight from fgets(); a
 *         trailing newline, if present, is stripped in place
 * lineno: index of the line in the input; currently unused
 *
 * A line that was already seen is ignored.  Otherwise the line is
 * remembered in string_hash and hashed with HASH_FUNC: if that hash
 * value was already produced by an earlier line, `collisions' is
 * incremented, else the value is recorded in result_hash and `nvalues'
 * is incremented.
 */
void
process_line (gchar *buffer, gint lineno)
{
  guint hash_value;
  size_t len = strlen (buffer);

  (void) lineno;

  /* Guard against the empty string (e.g. a line that starts with an
   * embedded NUL): there is no last character to look at, and len - 1
   * would wrap around.
   */
  if (len > 0 && buffer[len-1] == '\n')
    buffer[len-1] = '\0';

  if (g_hash_table_lookup (string_hash, buffer))
    return;			/* duplicate line */

  /* The table keeps its own copy of the text as key.  Only the presence
   * of an entry matters, so store a constant non-NULL marker rather than
   * a pointer into the caller's buffer.
   */
  g_hash_table_insert (string_hash, g_strdup (buffer), GINT_TO_POINTER (1));

  hash_value = HASH_FUNC (buffer);

  if (g_hash_table_lookup (result_hash,
                           GINT_TO_POINTER (hash_value)))
    {
      /* A different line already produced this hash value. */
      collisions++;
    }
  else
    {
      nvalues++;
      g_hash_table_insert (result_hash,
                           GINT_TO_POINTER (hash_value),
                           GINT_TO_POINTER (1));
    }
}

/* Report how evenly the collected hash values spread over a table of
 * `prime' bins.
 *
 * prime: number of bins to distribute the `nvalues' entries of `values'
 *        over (each value goes to bin value % prime)
 *
 * Nothing is reported unless the average bin load nvalues / prime lies
 * in [1.0, 10.0].  Otherwise a summary line with the minimum, maximum
 * and average load is printed, followed -- unless the spread between
 * the emptiest and the fullest bin exceeds 70 -- by a histogram with
 * one column per load 0..max whose height shows how many bins have that
 * load.
 */
void
test_prime (gint prime)
{
  gint *bins;
  gint *hist;
  gint min = G_MAXINT;
  gint max = 0;
  gint maxy, graph_max;
  gdouble avg;
  int i, j;

  /* A table needs at least one bin; this also keeps the division and
   * the modulo below well defined.
   */
  if (prime <= 0)
    return;

  avg = (double)nvalues/prime;
  /* Only calculate details for table sizes with a plausible load */
  if (avg < 1.0 || avg > 10.0)
    return;

  bins = g_new0 (gint, prime);

  for (i=0; i<nvalues; i++)
    {
      bins[values[i] % prime]++;
    }

  for (i=0; i<prime; i++)
    {
      if (bins[i] < min)
        min = bins[i];
      if (bins[i] > max)
        max = bins[i];
    }
  g_print ("%d: min %d; max %d; avg %.2f\n", prime, min, max, avg);

  /* Only plot histograms that have some relation to reality.  This has
   * to be decided here, once min and max are actually known.
   */
  if (max - min > 70)
    {
      g_free (bins);
      return;
    }

  /* Plot a histogram of bin loads: hist[n] is the number of bins that
   * hold exactly n values.
   */
  hist = g_new0 (gint, max+1);

  maxy = 0;
  for (i=0; i<prime; i++)
    {
      if (++hist[bins[i]] > maxy)
        maxy = hist[bins[i]];
    }

  /* Pick a reasonable scale: the smallest power of ten reaching maxy,
   * tightened to a fifth or a half of it when that still leaves
   * headroom.
   */
  graph_max = 1;
  while (graph_max < maxy)
    graph_max *= 10;
  if (graph_max / 5 > maxy)
    graph_max /= 5;
  else if (graph_max / 2 > maxy)
    graph_max /= 2;

  /* Eleven rows from the top of the scale down to zero; a column gets a
   * star in every row whose level lies below its count.
   */
  for (j=0; j<11; j++)
    {
      for (i=0; i<=max; i++)
        {
          fputc ((10-j)/10. * graph_max < hist[i] ? '*' : ' ', stdout);
        }
      printf ("  %.1f\n", (10-j)/10. * graph_max);
    }

  g_free (hist);
  g_free (bins);
}

#define BUFSIZE 1024

/* Read lines from stdin, collect the hash value of every distinct line,
 * print the number of distinct values and collisions, and then report
 * the distribution of those values for every table size in g_primes.
 */
int main ()
{
  gchar line[BUFSIZE];
  gint line_number;
  gint filled = 0;
  gint idx = 0;

  string_hash = g_hash_table_new (g_str_hash, g_str_equal);
  result_hash = g_hash_table_new (g_direct_hash, NULL);

  /* Feed every line of stdin, numbered from zero, to process_line()
   */
  for (line_number = 0;
       fgets (line, BUFSIZE-1, stdin) != NULL;
       line_number++)
    process_line (line, line_number);

  g_print ("%d distinct values, %d collisions\n", nvalues, collisions);

  /* Flatten the keys of result_hash into the `values' array
   */
  values = g_new (guint, nvalues);
  g_hash_table_foreach (result_hash, add_value, &filled);

  while (idx < g_nprimes)
    {
      test_prime (g_primes[idx]);
      idx++;
    }

  return 0;
}


