/*
 Copyright (c) 1997, Kevin Lang, NEC Research Institute, 
 See NECI license below.

  How to compile:

  gcc -Wall -O4 -o rlb rlb.c data-structures-1.c -DNOISY -DUSE_GC ~/lib/gc.a 
  
  or 

  gcc -Wall -O4 -o rlb rlb.c data-structures-1.c -DNOISY -DUSE_FREE

  This program is a translation of the Scheme program merge-order.scm,
  which should be available, along with a write-up of the algorithm,
  on the web site http://abbadingo.cs.unm.edu

  The program's command-line arguments are the name of a training set
  file (in the same format as the Abbadingo training files), and a
  value for the heuristic control parameter "window".  We suggest
  setting window to be twice the number of states you hope to obtain.

  This program can solve Abbadingo problems 1, 2, 3, R, 4, 6, and S.
  
  Unlike the Scheme version of the program, this C version doesn't
  maintain hash tables of nodes that are believed to be incompatible.
  That machinery complicates the code without making it much faster.

  The program can be compiled either with explicit calls to free, or
  with Boehm's very fine garbage collector for C (available at
  http://reality.sgi.com/employees/boehm_mti/gc.html).  The program is
  nearly the same speed either way, and if you intend to play around
  with the algorithm, I would highly recommend going the garbage
  collector route to avoid the hassle of explicit storage management.

  Run times on problem 4 using my i486 laptop:

    no reclamation.......41 sec
    explicit frees.......50 sec
    garbage collection...54 sec

  Note that the program's run time is quadratic in the value of
  window, and is potentially exponential in the length of the longest
  training string.  The exponential behavior is not inherent to the
  basic algorithm; it arises because compute_match_score() is
  implemented as a walk of the hypothesis graph out to a given depth.

 */



/************************************************************************/



/*
  
 Copyright (c) 1997, NEC Research Institute Inc.  All rights reserved. 
 
 Permission to use, copy, modify, and distribute this software and its
 associated documentation for non-commercial purposes is hereby
 granted, provided that the above copyright notice appears in all
 copies, derivative works or modified versions of the software and any
 portions thereof, and that both the copyright notice and this
 permission notice appear in the documentation.  NEC Research Institute
 Inc. shall be given a copy of any such derivative work or modified
 version of the software and NEC Research Institute Inc. and its
 affiliated companies (collectively referred to as NECI) shall be
 granted permission to use, copy, modify and distribute the software
 for internal use and research.  The name of NEC Research Institute
 Inc. and its affiliated companies shall not be used in advertising or
 publicity related to the distribution of the software, without the
 prior written consent of NECI.  All copies, derivative works or
 modified versions of the software shall be exported or reexported in
 accordance with applicable laws and regulations relating to export
 control.  This software is experimental.  NECI does not make any
 representations regarding the suitability of this software for any
 purpose and NECI will not support the software.  
 
 THE SOFTWARE IS PROVIDED AS IS.  NECI DOES NOT MAKE ANY WARRANTIES
 EITHER EXPRESS OR IMPLIED WITH REGARD TO THE SOFTWARE.  NECI ALSO
 DISCLAIMS ANY WARRANTY THAT THE SOFTWARE IS FREE OF INFRINGEMENT OF
 ANY INTELLECTUAL PROPERTY RIGHTS OF OTHERS.  NO OTHER LICENSE EXPRESS
 OR IMPLIED IS HEREBY GRANTED. NECI SHALL NOT BE LIABLE FOR ANY
 DAMAGES, INCLUDING GENERAL, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL
 DAMAGES, ARISING OUT OF THE USE OR INABILITY TO USE THE SOFTWARE.

*/

/************************************************************************/

#include <assert.h>
#include <limits.h>
#include <setjmp.h>
#include <stdio.h>
#include <stdlib.h>

#include <sys/types.h>
#include <sys/time.h>

#include "data-structures-1.h"


#ifdef USE_GC
#include "../include/gc.h"
#else
#include <malloc.h>
#endif


#ifndef dotimes
/* Loop helpers and small arithmetic macros.

   Every macro argument is parenthesized, so an lvalue expression such
   as *p may be used as the loop variable (unparenthesized, the
   increment `*p++` would advance the pointer instead of the value).

   MAX, MIN and ABS evaluate their arguments more than once; do not
   pass expressions with side effects. */

/* Execute the following statement with var = 0, 1, ..., lim-1. */
#define dotimes(var,lim) for ((var)=0;(var)<(lim);(var)++)
/* Step var along a list, starting at srclist and following CDR until
   NULL.  CDR is not defined in this file; it is expected to come from
   the list header. */
#define do_cdrs(var,srclist) for ((var)=(srclist);(var)!=NULL;(var)=CDR(var))
#define MAX(a,b) (((a)>(b)) ? (a) : (b))
#define MIN(a,b) (((a)<(b)) ? (a) : (b))
#define ABS(zz) ((zz)<0 ? -1*(zz) : (zz))
#endif

/* Plain-int boolean type and its two constants.  The whole group is
   skipped when `boolean` is already defined as a macro. */
#ifndef boolean
#define boolean int
#define false 0
#define true 1
#endif


/* Number of distinct input symbols; make_node() gives every node this
   many child slots.  read_cform_trainset() overwrites the default with
   the value found in the training file header. */
int alphabet_size = 2;



typedef struct tree_node
{
  int gloid_slot;
  int label_slot;
  int depth_slot;
  int saved_label_slot;
  struct tree_node ** child_array_slot;
} treeNode;


/* Marker stored in a child slot that has no transition. */
#define NONDEF NULL

/* True when tn is the missing-node marker. */
#define nondef_p(tn)         ((tn)==NONDEF)

/* Field readers.  tn must be a real node, not NONDEF; get_child's i
   indexes the child array and is not range-checked. */
#define gloid(tn)            ((tn)->gloid_slot)
#define get_label(tn)        ((tn)->label_slot)
#define get_depth(tn)        ((tn)->depth_slot)
#define get_saved_label(tn)  ((tn)->saved_label_slot)
#define get_child(tn,i)      ((tn)->child_array_slot[(i)])

/* Field writers; each expands to an assignment expression. */
#define set_label(tn,new)       ((tn)->label_slot=(new))
#define set_saved_label(tn,new) ((tn)->saved_label_slot=(new))
#define set_child(tn,i,new)     ((tn)->child_array_slot[(i)]=(new))



/* Returns true when lab is a real label, i.e. anything other than the
   -1 sentinel that make_node() stores in a freshly created node.

   Declared static: under C99 and later a plain `inline` definition
   does not emit an out-of-line copy of the function, so any call the
   compiler chooses not to inline would be an undefined reference at
   link time. */
static inline boolean label_defined (int lab)
{
  return lab != -1;
}


/***************************************************************/


/* Count of nodes created so far; make_node() uses the current value as
   the new node's id and then increments it. */
int global_node_counter = 0;


treeNode * make_node (int depth)
{
  int i;
  treeNode * tn = (treeNode *) my_malloc (sizeof(treeNode));
  treeNode ** ch = (treeNode **) my_malloc (alphabet_size*sizeof(treeNode *));

  tn->gloid_slot = global_node_counter++;
  tn->label_slot = -1;
  tn->saved_label_slot = -1;
  tn->depth_slot = depth;
  tn->child_array_slot = ch;
  dotimes (i, alphabet_size) ch[i] = NONDEF;

  return tn;
}

/*********************************************************************/

/* Debugging dump of the structure below curnode, written to stdout in
   pre-order.  A missing child prints as "?"; a node prints as
   "[id label child child ...]".  No cycle detection is done. */
void depth_first_print (treeNode * curnode)
{
  int sym;

  if (nondef_p (curnode)) {
    printf("?");
    return;
  }

  printf("[%d %d ", gloid (curnode), get_label (curnode));
  for (sym = 0; sym < alphabet_size; sym++)
    depth_first_print(get_child(curnode, sym));
  printf("]");
}


/* Follow the symbols of str downward from curnode, creating any
   missing child on the way (one level deeper than its parent), and
   store label on the node where the string ends.  An empty string
   (NULL) labels curnode itself. */
void augment_tree (treeNode * curnode, IP_Pair * str, int label)
{
  while (str != NULL) {
    int sym = ICAR(str);

    if (nondef_p(get_child(curnode,sym)))
      set_child(curnode,sym,make_node(1+get_depth(curnode)));

    curnode = get_child(curnode,sym);
    str = CDR(str);
  }

  set_label(curnode,label);
}




/*********************************************************************/


/* Build a prefix tree from a list of examples and return its root
   (depth 0).  For each example the first element is used as the label
   and the remaining elements as the symbol string. */
treeNode * new_build_tree (PP_Pair * example_list)
{
  treeNode * root = make_node (0);
  PP_Pair * rest;

  for (rest = example_list; rest != NULL; rest = CDR(rest)) {
    IP_Pair * example = PCAR(rest);
    augment_tree(root,CDR(example),ICAR(example));
  }

  return root;
}


/* Set to 1 by read_cform_trainset(), which asserts that it was still 0
   on entry, i.e. that the reader runs at most once. */
int read_cform_called_yet = 0;
/* Largest string length seen by read_cform_trainset(); stays -1 until
   a string has been read.  compute_match_score() uses it to bound its
   walk depth. */
int length_of_longest_string = -1;


PP_Pair * read_cform_trainset (char * name)
{
  FILE * infile = fopen (name, "r");
  int n_strings, i, j;
  PP_Pair * accumulate = NULL;
  PP_Pair * result;

  assert(read_cform_called_yet == 0);
  read_cform_called_yet = 1;

  if (infile == NULL) {
    printf("file %s not found\n", name);
    exit(-1);
  }

  fscanf(infile, "%d", &n_strings);
  fscanf(infile, "%d", &alphabet_size);

  dotimes (i, n_strings) {
    int lab, strlen, itmp;
    IP_Pair * rb2 = NULL;
    fscanf(infile, "%d", &lab);
    fscanf(infile, "%d", &strlen);
    dotimes (j, strlen) {
      fscanf(infile, "%d", &itmp);
      rb2 = ilist_cons(itmp, rb2);
    }
    length_of_longest_string = MAX(strlen,length_of_longest_string);
    accumulate = plist_cons (ilist_cons(lab,ilist_reverse (rb2)), accumulate);
#ifdef USE_FREE
    ilist_dispose(rb2);
#endif
  }
  fclose (infile);
  result = plist_reverse (accumulate);
#ifdef USE_FREE
    plist_dispose(accumulate);
#endif
  return result;
}



/*********************************************************************/


/* note: for unknown labels and transitions, we are 
   currently printing zeros instead of -1 and ? 
   like we used to.
*/


/* Print the graph reachable from start_node to stdout.

   Output: a header line "<node count> <alphabet size>", then one line
   per node in breadth-first order: "<number> <label> <child> ...".
   Nodes are renumbered 0, 1, 2, ... in the order they are first
   reached.  An undefined label and a missing child are both printed
   as 0.

   The queue and both tables are released before returning when the
   program is built with USE_FREE, matching what
   make_breadth_first_node_list() does with its own scratch
   structures. */
void fancy_print_tree (treeNode * start_node)
{
  I_Table * seen = make_itable ();       /* nodes already printed (pass 2) */
  I_Table * id_table = make_itable ();   /* node id -> output number */
  P_Queue * todo = make_pqueue ();
  int count = 0;
  int i;

  /* Pass 1: number every reachable node in breadth-first order. */

  pqueue_push (todo, (void *) start_node);

  while (!pqueue_empty_p(todo)) {
    treeNode * curnode = pqueue_pop(todo);
    if (!itable_test(id_table,gloid(curnode))) {
      itable_set (id_table, gloid(curnode), count++);
      dotimes (i, alphabet_size) {
        treeNode * x = get_child(curnode,i);
        if (!nondef_p(x)) pqueue_push(todo,x);
      }
    }
  }

  printf("%d %d\n", count, alphabet_size);

  /* Pass 2: same traversal, now printing one line per node. */

  pqueue_push (todo, (void *) start_node);

  while (!pqueue_empty_p(todo)) {
    treeNode * curnode = pqueue_pop(todo);
    int lab = get_label(curnode);
    if (!itable_test_and_set(seen,gloid(curnode))) {
      printf("%d %d ",
             itable_lookup(id_table, gloid(curnode)),
             (lab == -1) ? 0 : lab);
      dotimes (i, alphabet_size) {
        treeNode * x = get_child(curnode,i);
        if (nondef_p(x))
          printf("0 ");
        else {
          pqueue_push(todo,x);
          printf("%d ", itable_lookup(id_table,gloid(x)));
        }
      }
      printf("\n");
    }
  }

#ifdef USE_FREE
  deallocate_pqueue (todo);
  deallocate_itable (seen);
  deallocate_itable (id_table);
#endif
}



/*********************************************************************/



/* Return a list of every node reachable from start_node, each node
   once, in breadth-first order.  The scratch queue, the visited table
   and the intermediate reversed list are released under USE_FREE. */
PP_Pair * make_breadth_first_node_list (treeNode * start_node)
{
  I_Table * visited = make_itable ();
  P_Queue * pending = make_pqueue ();
  PP_Pair * reversed = NULL;   /* nodes in reverse visiting order */
  PP_Pair * in_order;
  int sym;

  pqueue_push (pending, (void *) start_node);

  while (!pqueue_empty_p(pending)) {
    treeNode * node = pqueue_pop(pending);

    /* a node can be queued more than once; handle it only the first time */
    if (itable_test(visited,gloid(node)))
      continue;

    itable_set (visited, gloid(node), 1);
    reversed = plist_cons (node, reversed);

    for (sym = 0; sym < alphabet_size; sym++) {
      treeNode * kid = get_child(node,sym);
      if (nondef_p(kid))
        continue;
      pqueue_push(pending,kid);
    }
  }

  in_order = plist_reverse (reversed);

#ifdef USE_FREE
  deallocate_pqueue (pending);
  deallocate_itable (visited);
  plist_dispose (reversed);
#endif

  return in_order;
}



/*********************************************************************/




/* This makes sure that the hypothesis is consistent with a given
   training example, adding on to the hypothesis if necessary.  */


/* Replay one training example through the hypothesis graph.

   curnode              node to start from
   string               remaining symbols of the example (NULL = none left)
   label                label the example must end on
   ptr_to_backout_list  undo log; one (symbol, parent, NONDEF) triple is
                        pushed for every child slot this call fills in
   bailout              jump target used when the example contradicts
                        the graph

   Missing transitions are created on the way.  At the end of the
   string, an unlabelled node receives label; a node that already has a
   different label causes longjmp(bailout, -1), so in that case the
   function does not return.

   The label comparison is written as a plain !=.  The earlier form
   `!(get_label(curnode)) == label` negated the stored label before
   comparing, which gives the intended answer only when both labels
   are 0 or 1. */
void relabel_check_and_augment_graph
     (treeNode * curnode, IP_Pair * string, int label,
      PP_Pair  ** ptr_to_backout_list,
      jmp_buf bailout)
{
  if (NULL==string) {

    if (label_defined(get_label(curnode))) {
      if (get_label(curnode) != label) {
        longjmp(bailout,-1);
      }
    }
    else {
      set_label(curnode,label);
    }
  }
  else {
    int cursym = ICAR(string);
    if (nondef_p(get_child(curnode,cursym))) {
      treeNode * newnode = make_node (1 + get_depth(curnode));
      /* log the old (empty) slot so undo_structural_changes() can
         unlink the new node again */
      *ptr_to_backout_list = plist_cons(make_ipp_triple(cursym,curnode,NONDEF),
                                        *ptr_to_backout_list);
      set_child(curnode,cursym,newnode);
    }
    relabel_check_and_augment_graph(get_child(curnode,cursym),
                                    CDR(string),
                                    label,ptr_to_backout_list,bailout);
  }
}



/* Walk the undo log from its head and put each recorded child pointer
   back: for a triple (i1, p2, p3), child i1 of node p2 becomes p3
   again.  The log itself is left untouched. */
void undo_structural_changes (PP_Pair * backout_list)
{
  PP_Pair * cell = backout_list;

  while (cell != NULL) {
    IPP_Triple * record = PCAR(cell);
    treeNode * parent = (treeNode *) record->p2;
    treeNode * previous_child = (treeNode *) record->p3;

    set_child (parent, record->i1, previous_child);
    cell = CDR(cell);
  }
}



/* Try to merge cand_1 and cand_2; keep the merge only if the resulting
   graph still agrees with every training example.

   cand_1, cand_2  the two nodes to merge
   root            root of the hypothesis graph
   trainset        list of training examples (label followed by symbols)
   all_nodes       list of the nodes whose arcs and labels are updated

   Steps:
     1. The node with the smaller depth survives ("live"); on a tie
        cand_2 survives.  Every arc in all_nodes that points at the
        other node ("dead") is redirected to live and logged.
     2. All labels in all_nodes are saved and then cleared.
     3. Every training example is replayed from root, which re-derives
        the labels and may add nodes.

   Returns true when every example replays without conflict; the graph
   then stays merged.  Returns false on a conflict, after undoing every
   logged structural change and restoring the saved labels.

   Not reentrant: the undo log lives in a static variable. */
boolean perform_merge_if_legal (treeNode * cand_1,
                                treeNode * cand_2,
                                treeNode * root,
                                PP_Pair * trainset,
                                PP_Pair * all_nodes)
{
  /* The undo log keeps growing after setjmp() (through the pointer
     handed to relabel_check_and_augment_graph) and is read again after
     longjmp().  A non-volatile automatic variable used that way has an
     indeterminate value after the jump, so the log is kept in static
     storage and reset on every call. */
  static PP_Pair * backout_list;
  PP_Pair * nodes;
  treeNode * node;
  treeNode * live, * dead;
  jmp_buf bailout;
  int i;

  backout_list = NULL;

  /* the deeper guy dies */

  if (get_depth(cand_1) < get_depth(cand_2)) {live = cand_1; dead = cand_2;}
  else                                       {dead = cand_1; live = cand_2;}

  /* save and clear labels; re-direct arcs from dead guy to live guy */

  do_cdrs (nodes,all_nodes) {
    node = PCAR(nodes);
    set_saved_label(node,get_label(node));
    set_label(node,-1);
    dotimes (i, alphabet_size) {
      if (get_child(node,i) == dead) {
        backout_list = plist_cons(make_ipp_triple(i,node,dead),backout_list);
        set_child(node,i,live);
      }
    }
  }

  /* make sure we are still consistent with training set */

  if (setjmp(bailout) == -1) {
    /* this bailout is from the call to relabel_check_and_augment_graph below */
    /* repair structure */
    undo_structural_changes (backout_list);
    /* restore labels */
    do_cdrs (nodes,all_nodes) {
      node = PCAR(nodes);
      set_label(node,get_saved_label(node));
    }
#ifdef USE_FREE
    deallocate_list_of_ipp (backout_list);
    backout_list = NULL;
#endif
    return false;
  }
  else {
    PP_Pair * examples;
    IP_Pair * example;
    do_cdrs (examples, trainset) {
      example = PCAR(examples);
      relabel_check_and_augment_graph(root,CDR(example),ICAR(example),
                                      &backout_list, bailout);
    }
#ifdef USE_FREE
    deallocate_list_of_ipp (backout_list);
    backout_list = NULL;
#endif
    return true;
  }

}


/*********************************************************************/



/* Here we compute the main part of the heuristic merge score,
   the number of actively matching labels within a relevant depth.
   If a mismatch is found, we bail out immediately.
 */   


/* Walk x and y in lock step, at most walk_depth levels below the
   starting pair, following only the symbols for which both nodes have
   a child.  Each visited pair where both nodes are labelled either
   adds one to *match_count_ptr (equal labels) or aborts the whole walk
   with longjmp(bailout, -1) (different labels). */
void bailing_walkit (treeNode * x, treeNode * y,
		     int walk_depth, int * match_count_ptr,
		     jmp_buf bailout)
{
  int lab_x = get_label(x);
  int lab_y = get_label(y);
  int sym;

  if (lab_x != -1 && lab_y != -1) {
    if (lab_x != lab_y)
      longjmp(bailout, -1);
    *match_count_ptr += 1;
  }

  if (walk_depth <= 0)
    return;

  for (sym = 0; sym < alphabet_size; sym++) {
    treeNode * x_kid = get_child(x,sym);
    treeNode * y_kid = get_child(y,sym);

    if (nondef_p(x_kid) || nondef_p(y_kid))
      continue;

    bailing_walkit(x_kid, y_kid, walk_depth-1, match_count_ptr, bailout);
  }
}


/* Heuristic score for merging cand_1 with cand_2: the number of label
   agreements found by bailing_walkit(), or -1 if that walk hits a
   label conflict.  The walk depth is length_of_longest_string minus
   the depth of the deeper candidate. */
int compute_match_score (treeNode * cand_1, treeNode * cand_2)
{
  int room_1 = length_of_longest_string - get_depth(cand_1);
  int room_2 = length_of_longest_string - get_depth(cand_2);
  int walk_depth = (room_1 < room_2) ? room_1 : room_2;
  int match_count = 0;
  jmp_buf bailout;

  /* bailing_walkit() jumps back here with -1 on a label conflict */
  if (setjmp(bailout) == -1)
    return -1;

  bailing_walkit(cand_1, cand_2, walk_depth, &match_count, bailout);
  return match_count;
}

/* Note that we never recompute the node depths, which are set when
   the prefix tree acceptor is first built.  This is because we only
   make backwards or sideways merges, which never introduce a shorter
   path from the root to a node.  */

/*********************************************************************/


/* Empty loop body, used to make intentionally empty loops visible. */
#define DO_NOTHING

/* Slices shorter than this are sorted by selection sort. */
#define SORT_THRESHOLD 16


/* Sort the slice perm[i] .. perm[j-1] (j is exclusive) in place so
   that key[perm[i]] <= key[perm[i+1]] <= ... ; key is only read.

   Short slices use selection sort.  Longer ones are partitioned
   around the key of the middle element (no randomization) and both
   halves are sorted recursively.

   The definition uses a prototype parameter list; the old
   identifier-list form gives callers no argument checking and is no
   longer part of the language as of C23. */
void really_qsort_index_f (int i, int j, float *key, int *perm)
{
  int r, l, k, min_i, swap;
  float min_key, comp_key;

  if ((j - i) < SORT_THRESHOLD) {
    for (l = i; l < j; l++) {
      min_i = l;
      min_key = key[perm[min_i]];
      for (k = l+1; k < j; k++) {
        comp_key = key[perm[k]];
        if (min_key > comp_key) {
          min_i = k;
          min_key = comp_key;
        }
      }
      swap = perm[min_i];
      perm[min_i] = perm[l];
      perm[l] = swap;
    }
    return;
  }

  /* Middle element becomes the pivot and is moved to the front.
     Written as i + (j-i)/2 so the sum cannot overflow. */
  r = i + (j - i) / 2;
  swap = perm[r]; perm[r] = perm[i]; perm[i] = swap;

  {
    float x = key[perm[i]];
    int ii = i - 1;
    int jj = j;

    /* Two cursors move towards each other.  The pivot sitting at the
       front stops the downward scan, and an element >= pivot always
       exists to stop the upward scan, so neither leaves the slice. */
    while (1) {

      while (key[perm[--jj]] > x) DO_NOTHING;
      while (key[perm[++ii]] < x) DO_NOTHING;

      if (ii >= jj) {
        r = jj;
        break;
      }
      swap = perm[jj]; perm[jj] = perm[ii]; perm[ii] = swap;
    }
  }

  /* everything in [i, r] is <= everything in [r+1, j) */
  really_qsort_index_f(i,r+1,key,perm);
  really_qsort_index_f(r+1,j,key,perm);
}



/* This creates a permutation with the property
   that key[perm[i]] is in nondecreasing order.
   The vector of keys is not modified.
*/


/* Fill perm[0..n-1] with the index order that sorts key: afterwards
   key[perm[0]] <= key[perm[1]] <= ... .  key is not modified.  The
   resulting order is checked with assert(). */
void qsort_index_f (int n, float *key, int *perm)
{
  int i;

  /* start from the identity permutation */
  i = 0;
  while (i < n) {
    perm[i] = i;
    i++;
  }

  really_qsort_index_f(0, n, key, perm);

  /* verify that neighbours are in order */
  i = 0;
  while (i < n-1) {
    assert(key[perm[i]] <= key[perm[i+1]]);
    i++;
  }
}



/************************************************************/


/* Progress messages written to stderr by arb_collapse().  Each macro
   names local variables of that function (all_nodes, actual_n_cands,
   c1, c2, perm, scores, i, succeeded_p), so it can only be used there.
   Without NOISY all three expand to nothing. */
#ifdef NOISY
#define NOISE_1 fprintf(stderr, "%d nodes, ", plist_length(all_nodes)); fflush(stderr);
#define NOISE_2 fprintf(stderr, "%d legal merges, ", actual_n_cands); fflush(stderr); 
#define NOISE_3 fprintf(stderr, "merging %d %d (score %g) -> %s\n", gloid(c1[perm[i]]), gloid(c2[perm[i]]), scores[perm[i]], succeeded_p ? "ok" : "failed"); 
#else
#define NOISE_1
#define NOISE_2
#define NOISE_3
#endif



/************************************************************/


/* main routine */


/* Repeatedly merge pairs of nodes of the graph rooted at root until no
   candidate merge succeeds.

   root      root of the hypothesis graph (modified in place)
   trainset  training examples that every merge must stay consistent with
   window    number of nodes, taken from the front of the breadth-first
             node list, whose pairs are considered in each round

   Each round:
     1. list all nodes breadth-first and take the window from the front
        (via plist_safe_head);
     2. score every pair in the window with compute_match_score() and
        keep the pairs whose score is not negative;
     3. sort the kept pairs by score and try them from the highest
        score downwards with perform_merge_if_legal();
     4. after the first successful merge start a new round; if every
        attempt fails, or there are no candidates, stop.

   With DOUBLE_WINDOW defined, stopping while the graph still has more
   than window nodes doubles the window and starts over instead. */
void arb_collapse (treeNode * root, PP_Pair * trainset, int window)
{
  /* These variables are declared static to fix compiler warnings that
     they might be trashed by setjmp/longjmp.  The declarations are
     acceptable because the function is only called once. */

  static PP_Pair *all_nodes = NULL;
  static PP_Pair *window_nodes = NULL;
  static PP_Pair *p1, *p2;

  static int actual_n_cands;
  int potential_n_cands;
  treeNode **c1;
  treeNode **c2;
  float *scores;
  int * perm;

#ifdef DOUBLE_WINDOW
window_loop:
#endif

  /* room for every unordered pair of window nodes */
  actual_n_cands = 0;
  potential_n_cands = window * (window-1) / 2;
  c1 = (treeNode **)my_malloc(potential_n_cands*sizeof(treeNode *));
  c2 = (treeNode **)my_malloc(potential_n_cands*sizeof(treeNode *));
  scores = (float *)    my_malloc (potential_n_cands * sizeof(float));
  perm    = (int *)      my_malloc (potential_n_cands * sizeof(int));

outer_loop:

  /* drop the node lists of the previous round (both NULL the first time) */
#ifdef USE_FREE      
  plist_dispose(all_nodes);
  plist_dispose(window_nodes);
#endif

  all_nodes = make_breadth_first_node_list (root);
  window_nodes = plist_safe_head(all_nodes,window);
  NOISE_1;

  actual_n_cands = 0;

  /* nested loop to examine pairs of nodes */

  do_cdrs (p1, window_nodes) {
    do_cdrs (p2, CDR (p1)) {
      treeNode * cand_1 = PCAR(p1); 
      treeNode * cand_2 = PCAR(p2); 
      int score = compute_match_score(cand_1,cand_2);
      /* a negative score means the pair has conflicting labels */
      if (score >= 0) {
	c1[actual_n_cands] = cand_1;
	c2[actual_n_cands] = cand_2;
	/* the fractional part ranks pairs with the same match count:
	   the smaller the larger depth of the two, the higher the score */
	scores[actual_n_cands] = score + 
	  /* goofy tie breaker rule */
	  0.9 + (-.01 * MAX(get_depth(cand_1), get_depth(cand_2)));
	actual_n_cands += 1;
      }	    
    }
  }
  NOISE_2;
  if (actual_n_cands > 0) {
    boolean succeeded_p;
    /* perm is sorted by ascending score, so start at the last entry */
    int i = actual_n_cands-1;
    /* sort those merge candidates */
    qsort_index_f (actual_n_cands, scores, perm);
  inner_loop:
    succeeded_p = perform_merge_if_legal (c1[perm[i]],c2[perm[i]],
					  root,trainset,all_nodes);
    NOISE_3;
    if (succeeded_p) {
      /* the graph changed: rebuild the node list and rescore */
      goto outer_loop;
    }
    else if (i > 0) {
      /* try the next best candidate */
      i -= 1;
      goto inner_loop;
    }
  }

#ifdef DOUBLE_WINDOW
  if (plist_length(all_nodes) > window) {
    window *= 2;
    /* fprintf(stderr,"doubling window to %d\n", window); */
    /* memory leak when using free */
    goto window_loop;
  }
#endif

#ifdef USE_FREE      
  plist_dispose(all_nodes);
  plist_dispose(window_nodes);
  my_free (c1, potential_n_cands*sizeof(treeNode *));
  my_free (c2, potential_n_cands*sizeof(treeNode *));
  my_free (scores, potential_n_cands * sizeof(float));
  my_free (perm,   potential_n_cands * sizeof(int));
#endif
}




/*********************************************************************/




/* tv_sec value captured by init_millisecond_timer(); millisecond_timer()
   subtracts it from the current tv_sec. */
long initial_seconds;


/* Record the seconds field of the current time of day in
   initial_seconds, the reference point for millisecond_timer(). */
void init_millisecond_timer ()
{
  struct timeval now;

  gettimeofday(&now, NULL);
  initial_seconds = now.tv_sec;
}


/* this gives a timestamp in units of milliseconds */
/* the actual time resolution may be lower */


/* Return a timestamp in milliseconds: whole seconds elapsed since
   initial_seconds was recorded, times 1000, plus the millisecond part
   of the current time of day. */
int millisecond_timer ()
{
  struct timeval now;
  int elapsed_seconds;

  gettimeofday(&now, NULL);
  elapsed_seconds = now.tv_sec - initial_seconds;

  return elapsed_seconds * 1000 + now.tv_usec / 1000;
}


/*********************************************************************/


/* Program entry point.

   Usage: <program> filename window

   Reads the training set from filename, builds the prefix tree,
   collapses it with arb_collapse() using the given window, and prints
   the resulting graph to stdout with fancy_print_tree().  When built
   with NOISY, node counts, the allocator report and the time spent in
   each phase go to stderr.

   window must be a non-negative decimal integer that fits in an int.
   It is parsed with strtol() so that malformed values are reported;
   atoi() would silently turn them into 0 or accept trailing garbage.

   Exits with status -1 on a usage error, 0 otherwise. */
int main (int argc, char **argv)
{
  treeNode * pig;
  PP_Pair * training_set;
  int desired_window;
  int initial_n_nodes;
  long parsed_window;
  char * parse_end;

  int t1, t2, t3, t4;

  init_millisecond_timer ();

  if (argc != 3) {
    printf("usage: %s filename window\n", argv[0]);
    exit(-1);
  }

  parsed_window = strtol(argv[2], &parse_end, 10);
  if (parse_end == argv[2] || *parse_end != '\0' ||
      parsed_window < 0 || parsed_window > INT_MAX) {
    fprintf(stderr, "%s: window must be a non-negative integer, got \"%s\"\n",
            argv[0], argv[2]);
    exit(-1);
  }
  desired_window = (int) parsed_window;

  t1 = millisecond_timer ();

  training_set = read_cform_trainset (argv[1]);
  /*  fprintf(stderr, "max string len is %d\n", length_of_longest_string); */

  pig = new_build_tree (training_set);
  initial_n_nodes = global_node_counter;

#ifdef NOISY
  fprintf(stderr, "%d nodes allocated\n", initial_n_nodes); 
#endif

  t2 = millisecond_timer ();
  arb_collapse (pig, training_set, desired_window);
  t3 = millisecond_timer ();
  fancy_print_tree (pig); 
  t4 = millisecond_timer ();

#ifdef NOISY
  fprintf(stderr, "%d additional nodes allocated\n",
          global_node_counter - initial_n_nodes);
  my_malloc_report(); 
  fprintf(stderr, "%d msec reading, %d msec crunching, %d msec writing\n",
          t2-t1, t3-t2, t4-t3);
#endif

  return(0);

}

