/******************************************************************************/
/* psx-smi.c                                  Stochastical Matching Interface */
/******************************************************************************/
/** @file psx-smi.c Stochastic Matching Interface - Source Code File
 * Type definition of the SMI component structure and the functions that
 * implement stochastic matching.
 */
 
#include "psx.h"

#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/******************************************************************************/
/*                                                                            */
/******************************************************************************/

/* for logging: */
#define MOD PSX_MOD_SMI

/******************************************************************************/
/* Private                                                                    */
/******************************************************************************/

/** One matching component, i.e. one row of table "cmp".
 * smi_init() fills one entry per row from the columns idx, atr and dtp.
 */
typedef struct {
 IDX idx;     /**< key value for this attribute */
 PSX_STR atr; /**< name of attribute            */
 DTP dtp;     /**< datatype of characteristic   */
} SMI_CMP;

/******************************************************************************/
/*                                                                            */
/******************************************************************************/

/** global DBI-pointer, set by smi_init() and smi_register() */
static DBI * dbi = NULL;

/** array of all components, built by smi_init() from table "cmp" */
static SMI_CMP * cmps = NULL;
/** number of entries in cmps              */
static NUM num_cmps = 0;

/** thresholds for smi_decide: a weight <= threshold1 yields "not equal",
 *  a weight >= threshold2 yields "equal", anything in between is undecided */
static double threshold1 = -1.0;
static double threshold2 = 1.0;

/******************************************************************************/
/*                                                                            */
/******************************************************************************/
/** initialize smi: get global dbi pointer and create copy of table cmp 
 */
 
static BOOL smi_init ()
{
 if (!pdi_get_dbi (&dbi))                   /* fetch dbi pointer */
  return (FALSE);

 if (!dbi_select (dbi,"cmp","*",NULL))      /* read table "cmp"  */
  return (FALSE);

 cmps = NULL;
 num_cmps = 0;
 
 while(TRUE)
 {
  char * idxstr;
  char * atr;
  char * dtpstr;

  if (!dbi_fetch (dbi,0,NULL))
   break;                    /* end of table "cmp" => abort */

  /* else: read new component: */

  num_cmps++;              /* increase number of components */

  
  if ((cmps = realloc (cmps,sizeof(SMI_CMP)*num_cmps)) == NULL) /* reallocate array */
   return (FALSE);
 
  /* read values: */

  if (!dbi_fetch (dbi,0,&idxstr))
   return (FALSE);
  cmps[num_cmps - 1].idx = atoi (idxstr);
  /* Error-Check: the value could also be set like this:) */
  assert (cmps[num_cmps - 1].idx == num_cmps - 1);

  if (!dbi_fetch (dbi,1,&atr))
   return (FALSE);
  strcpy (cmps[num_cmps - 1].atr,atr);
  
  if (!dbi_fetch (dbi,2,&dtpstr))
   return (FALSE);
  cmps[num_cmps - 1].dtp = atoi (dtpstr);
 }

 return (TRUE);
}

/******************************************************************************/
/** Release the component array built by smi_init() and forget the dbi pointer.
 *
 * The teardown is unconditional: dbi is cleared even when table "cmp" was
 * empty and cmps therefore never got allocated. num_cmps is not touched
 * here; smi_init() resets it before the array is rebuilt.
 *
 * @return always TRUE
 */

static BOOL smi_exit (void)
{
 free (cmps);            /* free(NULL) is a no-op, no guard needed */
 cmps = NULL;
 dbi = NULL;
 return (TRUE);
}

/******************************************************************************/
/** Execute a statement and return column 0 of its first row as an integer.
 *
 * @param select string pointer to sql statement
 * @param n      pointer to integer value after invocation; only written on
 *               success
 * @return TRUE on success; FALSE if the statement fails, yields no row, or
 *         the fetched value is missing
 */

static BOOL smi_sql_get_int (const char * select,int * n)
{
 char * buf = NULL;

 assert (select != NULL && n != NULL);

 if (!dbi_execute (dbi,select))    /* execute SELECT            */
  return (FALSE);

 if (!dbi_fetch (dbi,0,NULL))      /* no row available => fail  */
  return (FALSE);

 if (!dbi_fetch (dbi,0,&buf))      /* read value of column 0    */
  return (FALSE);

 if (buf == NULL)                  /* nothing to convert        */
  return (FALSE);

 *n = atoi (buf);                  /* convert value and return  */
 return (TRUE);
}

/******************************************************************************/
/** Increase the frequency of an entry in table "frq"; the attribute is given
 *  by its component index. A missing entry is inserted with frequency 1.
 *
 * All statements are built with a bounded snprintf: a value that does not
 * fit into PSX_STR makes the call fail instead of overflowing the buffer.
 *
 * NOTE(review): val is pasted into the statement unescaped, so a value that
 * contains a single quote breaks the statement (SQL injection). This needs
 * an escaping or parameter facility from the dbi layer.
 *
 * @param idx integer index
 * @param val pointer to valuestring
 * @return TRUE on success, FALSE if a statement is too long or fails
 */
static BOOL smi_inc_by_idx (int idx,char * val)
{
 PSX_STR sql;
 int len;

 /* build SELECT statement */
 len = snprintf (sql,sizeof (sql),"SELECT (freq) FROM frq WHERE cmp=%d AND val='%s'",idx,val);
 if (len < 0 || (size_t) len >= sizeof (sql))
  return (FALSE);
 if (!dbi_execute (dbi,sql))
  return (FALSE);

 if (!dbi_fetch (dbi,0,NULL)) /* no row => new entry */
 {
  /* build INSERT */
  len = snprintf (sql,sizeof (sql),"INSERT INTO frq VALUES (%d, '%s', 1)",idx,val);
 }
 else /* increment entry */
 {
  /* build UPDATE */
  len = snprintf (sql,sizeof (sql),"UPDATE frq SET freq=freq+1 WHERE cmp=%d AND val='%s'",idx,val);
 }

 if (len < 0 || (size_t) len >= sizeof (sql))
  return (FALSE);
 if (!dbi_execute (dbi,sql))
  return (FALSE);

 return (TRUE);
}

/******************************************************************************/
/** Decrease the frequency of an entry in table "frq"; the attribute is given
 *  by its component index. An entry whose frequency would drop to zero (or
 *  is already at or below zero) is deleted instead of being decremented.
 *
 * All statements are built with a bounded snprintf: a value that does not
 * fit into PSX_STR makes the call fail instead of overflowing the buffer.
 *
 * NOTE(review): val is pasted into the statement unescaped, so a value that
 * contains a single quote breaks the statement (SQL injection). This needs
 * an escaping or parameter facility from the dbi layer.
 *
 * @param idx integer index
 * @param val pointer to valuestring
 * @return TRUE on success; FALSE if the entry cannot be read or a statement
 *         is too long or fails
 */

static BOOL smi_dec_by_idx (int idx,char * val)
{
 PSX_STR sql;
 int freq;
 int len;

 len = snprintf (sql,sizeof (sql),"SELECT (freq) FROM frq WHERE cmp=%d AND val='%s'",idx,val);
 if (len < 0 || (size_t) len >= sizeof (sql))
  return (FALSE);

 if (!smi_sql_get_int (sql,&freq))    /* read frequency value */
  return (FALSE);

 if (freq <= 1)
 {
  /* delete entry */
  len = snprintf (sql,sizeof (sql),"DELETE FROM frq WHERE cmp=%d AND val='%s'",idx,val);
 }
 else
 {
  /* decrement frequency */
  len = snprintf (sql,sizeof (sql),"UPDATE frq SET freq=freq-1 WHERE cmp=%d AND val='%s'",idx,val);
 }

 if (len < 0 || (size_t) len >= sizeof (sql))
  return (FALSE);
 if (!dbi_execute (dbi,sql))
  return (FALSE);

 return (TRUE);
}

/******************************************************************************/
/** Register one component: insert it into table "cmp" and count every value
 *  of column f_<sym> of table "rec" in the frequency table.
 *
 * The column values are copied before the frequency table is updated,
 * because smi_inc_by_idx() runs further statements on the same dbi handle.
 * NOTE(review): presumably those statements may invalidate strings handed
 * out by dbi_fetch(); the copies make this safe either way.
 * The copies and the pointer array are released on every path.
 *
 * @param sym pointer to symbolstring
 * @param dtp datatype
 * @param idx integer index
 * @return TRUE on success; FALSE if a statement is too long or fails, a
 *         value is missing, or memory runs out
 */

static BOOL smi_make_cmp (char * sym, DTP dtp, int idx)
{
 PSX_STR sql;
 char ** vals = NULL;
 NUM num = 0;
 NUM i;
 int len;
 BOOL ok = FALSE;

 /* enter new component into table "cmp" */
 len = snprintf (sql,sizeof (sql),"INSERT INTO cmp VALUES (%d, '%s', %d)",idx,sym,dtp);
 if (len < 0 || (size_t) len >= sizeof (sql))
  return (FALSE);
 if (!dbi_execute (dbi,sql))
  return (FALSE);

 /* read from table "rec" */
 len = snprintf (sql,sizeof (sql),"SELECT (f_%s) FROM rec",sym);
 if (len < 0 || (size_t) len >= sizeof (sql))
  return (FALSE);
 if (!dbi_execute (dbi,sql))
  return (FALSE);

 while (dbi_fetch (dbi,0,NULL))    /* FALSE => end of table */
 {
  char ** grown;
  char * cell = NULL;
  char * copy;
  size_t size;

  if (!dbi_fetch (dbi,0,&cell) || cell == NULL)
   goto done;

  size = strlen (cell) + 1;
  copy = malloc (size);
  if (copy == NULL)
   goto done;
  memcpy (copy,cell,size);

  /* grow via a temporary: on failure the old array must stay reachable */
  grown = realloc (vals,(num + 1) * sizeof (*vals));
  if (grown == NULL)
  {
   free (copy);
   goto done;
  }
  vals = grown;
  vals[num++] = copy;
 }

 /* update frequencytable */
 for (i = 0; i < num; i++)
 {
  if (!smi_inc_by_idx (idx,vals[i]))
   goto done;
 }

 ok = TRUE;

done:
 for (i = 0; i < num; i++)
  free (vals[i]);
 free (vals);

 return (ok);
}

/******************************************************************************/
/** Look up an attribute name in the component array cmps.
 * @param atr string pointer to attribute name (compared case sensitively)
 * @param idx receives the array position of the first match; left untouched
 *            when nothing matches
 * @return TRUE if the attribute was found, FALSE otherwise
 */
static BOOL smi_get_idx (char * atr, IDX * idx)
{
 IDX pos = 0;

 while (pos < num_cmps)
 {
  if (strcmp (cmps[pos].atr, atr) == 0)  /* found */
  {
   *idx = pos;
   return (TRUE);
  }
  pos++;
 }

 return (FALSE);                         /* no such attribute */
}

/******************************************************************************/
/** Count the rows of table "rec".
 * @param n pointer to integer receiving the row count
 * @return result of smi_sql_get_int() for the COUNT(*) statement
 */

static BOOL smi_get_total (int * n)
{
 static const char query[] = "SELECT COUNT(*) FROM rec";

 return (smi_sql_get_int (query, n));
}

/******************************************************************************/
/** Get the frequency of the entry with the given component and value.
 *
 * A failing lookup (no such entry, or a failing query) is reported as
 * frequency 0, not as an error. The statement is built with a bounded
 * snprintf: a value that does not fit into PSX_STR makes the call fail
 * instead of overflowing the buffer.
 *
 * NOTE(review): val is pasted into the statement unescaped, so a value that
 * contains a single quote breaks the statement (SQL injection).
 *
 * @param cmp value component for attribute cmp in database query
 * @param val string pointer to valuestring
 * @param n pointer to integer frequency after invocation
 * @return TRUE, or FALSE if the statement does not fit into PSX_STR
 */

static BOOL smi_get_freq (IDX cmp, char * val, int * n)
{
 PSX_STR sql;
 int len;

 assert (cmp < num_cmps && val != NULL && n != NULL);

 /* cast: %d expects an int, whatever IDX is defined as */
 len = snprintf (sql, sizeof (sql), "SELECT (freq) FROM frq WHERE cmp=%d AND val='%s'", (int) cmp, val);
 if (len < 0 || (size_t) len >= sizeof (sql))
  return (FALSE);

 if (!smi_sql_get_int (sql, n))
  *n = 0;                        /* unknown value counts as frequency 0 */

 return (TRUE);
}

/******************************************************************************/
/** Compute the relative frequency of a value: its frequency divided by total.
 * @param cmp component
 * @param val string pointer to valuestring
 * @param rf double pointer receiving the relative frequency; only written
 *           on success
 * @param total integer divisor, must not be 0
 * @return TRUE on success, FALSE if the frequency cannot be determined
 */
static BOOL smi_rel_frq (IDX cmp, char * val, double * rf, int total)
{
 int count;

 assert (cmp < num_cmps && val != NULL && rf != NULL && total != 0);

 if (smi_get_freq (cmp, val, &count))
 {
  /* absolute -> relative frequency */
  *rf = ((double)count)/((double)total);
  return (TRUE);
 }

 return (FALSE);
}

/******************************************************************************/
/** Report the error rate of a component.
 * Placeholder implementation: the rate is 5% for every component.
 * @param cmp component (only range checked by the assertion)
 * @param rate double pointer receiving the rate
 * @return always TRUE
 */

static BOOL smi_error_rate(IDX cmp, double * rate)
{
 const double fixed_rate = 0.05;  /* dummy value, the same for all components */

 assert (cmp < num_cmps && rate != NULL);

 *rate = fixed_rate;
 return (TRUE);
}

/******************************************************************************/
/** Check two values for equality, depending on the datatype.
 *
 * DTP_DD, DTP_DM and DTP_DY are converted with atoi() and compared as
 * integers. Every other datatype, including DTP_DATE (yyyy-mm-dd, no
 * dedicated comparison yet), is compared as a case insensitive string.
 * If configure found neither strcasecmp() nor stricmp(), a portable loop
 * based on tolower() is used, so the function returns a value in every
 * build configuration.
 *
 * @param dtp datatype
 * @param val1 string pointer to first valuestring
 * @param val2 string pointer to second valuestring
 * @return non-zero if the values are considered equal, 0 otherwise
 */
static BOOL smi_equal (DTP dtp, char * val1, char * val2)
{
 assert (val1 != NULL && val2 != NULL);

 switch (dtp)
 {
 case DTP_DD:
 case DTP_DM:
 case DTP_DY:
  /* numerical value: convert & check with "==" */
  return atoi (val1) == atoi (val2);
 /*:BG:begin:datatype date:yyyy-mm-dd*/
 case DTP_DATE:
  /* fallthrough: dates are compared as strings */
 /*:BG:end*/
 default:
  break;
 }

 /* else: String comparison, case insensitive */
#if defined(HAVE_STRCASECMP) /* checked by configure */
 return !strcasecmp (val1,val2);
#elif defined(HAVE_STRICMP)  /* checked by configure */
 return !stricmp (val1,val2);
#else
 {
  /* unsigned char: tolower() is undefined for negative char values */
  const unsigned char * a = (const unsigned char *) val1;
  const unsigned char * b = (const unsigned char *) val2;

  while (*a != '\0' && tolower (*a) == tolower (*b))
  {
   a++;
   b++;
  }
  /* equal only if both strings ended at the same position */
  return tolower (*a) == tolower (*b);
 }
#endif
}

/******************************************************************************/
/** Calculate the weight contributed by a single attribute.
 *
 * ex and ey are the error rates of the component, n the number of values.
 * The formula depends on the distribution registered for the attribute:
 *  - DST_UNI: log(n) if the values are equal, else log(ex + ey), plus
 *             log(1 + 1/(n - 1)) when n < 6
 *  - DST_FIX: with px/py from dst_prob(): -log(px) if equal, else
 *             log(ex/px + ey/py) - log(n - 1)
 *  - DST_REL: same as DST_FIX, but px/py are relative frequencies and n is
 *             total; in the unequal case a zero frequency is replaced by
 *             0.5 and n by 1.5 when total is 1
 *
 * @param cmp component
 * @param val1 string pointer to first value
 * @param val2 string pointer to second value
 * @param total integer value of total frequency (used for DST_REL only)
 * @param weight double pointer to weight value; set to 0.0 first, so it
 *               stays 0.0 when the call fails before a weight is computed
 * @return TRUE on success; FALSE if a lookup fails or the distribution type
 *         is unknown
 */

static BOOL smi_weight_one (IDX cmp, char * val1, char * val2, int total, double * weight)
{
 const double delta = 0.5;  /* stand-in for a relative frequency of 0 */
 double ex, ey, px, py, n;
 PSX_DST * dst;
 DTP dtp;

 assert (cmp < num_cmps && val1 != NULL && val2 != NULL && weight != NULL);

 *weight = 0.0;

 /* error rates for both values */
 if (!smi_error_rate (cmp,&ex) || !smi_error_rate (cmp,&ey))
  return (FALSE);

 /* distribution and datatype of the attribute */
 if (!dst_get (cmps[cmp].atr,&dst))
  return (FALSE);
 dtp = cmps[cmp].dtp;

 switch (dst -> dst)
 {
 case DST_UNI:                   /* homogeneously distributed */
  n = dst -> n;
  if (!smi_equal (dtp, val1, val2))
  {
   *weight = log (ex + ey);
   if (n < 6) *weight += log (1 + 1/(n - 1));
   break;
  }
  *weight = log (n);
  break;

 case DST_FIX:                   /* fixed probability per value */
  if (!dst_prob (dst, val1, &px))
   return (FALSE);
  if (!smi_equal (dtp, val1, val2))
  {
   if (!dst_prob (dst, val2, &py)) return (FALSE);
   *weight = log (ex/px + ey/py) - log (dst -> n - 1);
   break;
  }
  *weight = -log (px);
  break;

 case DST_REL:                   /* relative frequencies from table "frq" */
  if (!smi_rel_frq (cmp, val1, &px, total))
   return (FALSE);
  if (smi_equal (dtp, val1, val2))
  {
   *weight = -log (px);
   break;
  }
  if (!smi_rel_frq (cmp, val2, &py, total))
   return (FALSE);
  n = (total == 1) ? 1.5 : total;      /* exception: only one value */
  if (px == 0.0) px = delta;           /* exception: unknown value  */
  if (py == 0.0) py = delta;
  *weight = log (ex/px + ey/py) - log (n - 1);
  break;

 default:                        /* wrong distribution type */
  LOG (MOD, "Unknown distribution type!");
  return (FALSE);
 }

 return (TRUE);
}

/******************************************************************************/
/** Copy the fields of an input record (from a request) into an output
 *  record, in the order of the component array cmps.
 * @param in pointer to input record; must contain every component attribute
 * @param out pointer to output record the fields are appended to
 * @return TRUE on success; FALSE if an attribute is missing in the input
 *         or cannot be added to the output
 */

static BOOL smi_sort_rec (PSX_REC * in, PSX_REC * out)
{
 IDX k = 0;

 assert (in != NULL && out != NULL);

 while (k < num_cmps)                /* one pass per component */
 {
  char * field;

  if (!rec_get (in, cmps[k].atr, &field))
  {
   LOG (MOD, "ERROR: Required attribute missing in request!");
   return (FALSE);
  }

  if (!rec_add1 (out, cmps[k].atr, field))
   return (FALSE);

  k++;
 }

 return (TRUE);
}

/******************************************************************************/
/** Weight function for all attributes: sum of the single attribute weights.
 *
 * Item i of both records is weighed as component i, so both records are
 * expected in the order of the component array (see smi_sort_rec()). A
 * second record with fewer items than the first is rejected up front
 * instead of being read beyond its last item.
 *
 * @param rec1 pointer to first record
 * @param rec2 pointer to 2nd record
 * @param total integer value of total frequency
 * @param weight double pointer to weight value after invocation; holds the
 *               partial sum if the call fails part way
 * @return TRUE on success; FALSE if rec2 is shorter than rec1 or a single
 *         weight cannot be calculated
 */

static BOOL smi_weight_rec (PSX_REC * rec1, PSX_REC * rec2, int total, double * weight)
{
 IDX i;

 assert (rec1 != NULL && rec2 != NULL && weight != NULL);

 *weight = 0.0;

 if (rec2 -> n < rec1 -> n)                /* rec2 -> itm[i] would not exist */
  return (FALSE);

 for (i = 0; i < rec1 -> n; i++)           /* loop over all attributes */
 {
  double w;

                                      /* calculate weight individually */
  if (!smi_weight_one (i, rec1 -> itm[i].val, rec2 -> itm[i].val, total, &w))
   return (FALSE);

  /* add up the single weights */
  *weight += w;
 }

 return (TRUE);
}

/******************************************************************************/
/** Decision procedure: classify a pair of records by their total weight.
 * @param rec1 pointer to first record
 * @param rec2 pointer to 2nd record
 * @param total integer value of total frequency
 * @param status integer pointer to status after invocation:
 *               -1 not equal  (weight <= threshold1)
 *                0 undecided  (threshold1 < weight < threshold2)
 *                1 equal      (otherwise)
 * @return TRUE on success, FALSE if the weight cannot be calculated
 */

static BOOL smi_decide (PSX_REC * rec1, PSX_REC * rec2, int total, int * status)
{
 double weight;

 assert (rec1 != NULL && rec2 != NULL && status != NULL);

 if (!smi_weight_rec (rec1, rec2, total, &weight))
  return (FALSE);

 /* map the weight onto the three way decision */
 *status = (weight <= threshold1) ? -1
         : (weight <  threshold2) ?  0
         :                           1;

 return (TRUE);
}

/******************************************************************************/
/* Public                                                                    */
/******************************************************************************/
/** Create the tables "cmp" and "frq" and fill them from the schema.
 *
 * For every item of the format spec the attribute symbols are fetched as a
 * string list; each symbol becomes one component (see smi_make_cmp()),
 * numbered consecutively from 0. The string list is released on every path,
 * including the error paths.
 *
 * @param sch pointer to schema
 * @return TRUE on success; FALSE if the dbi pointer is not available, the
 *         format spec is missing or empty, or a database or list operation
 *         fails
 */

BOOL smi_register (PSX_SCH * sch)
{
 PSX_FSP * fsp;
 IDX i;
 int cmp_idx = 0;

 assert (sch != NULL);

 if (!pdi_get_dbi (&dbi))       /* get dbi-Pointer */
  return (FALSE);

 fsp = sch -> fsp;

 if (fsp == NULL || fsp -> n <= 0)
 {
  LOG(MOD, "No fields in format spec!");
  return (FALSE);
 }

 /* create (empty) tables */
 if (!dbi_execute (dbi, "CREATE TABLE cmp (idx INT PRIMARY KEY, atr VARCHAR(80) NOT NULL, dtp INT, UNIQUE (idx, atr))"))
  return (FALSE);
 if (!dbi_execute (dbi, "CREATE TABLE frq (cmp INT REFERENCES cmp (idx), val TEXT NOT NULL, freq INT)"))
  return (FALSE);

 for (i = 0; i < fsp -> n; i++)
 {
  FMT_ITM * fi = &fsp -> itm[i];
  PSX_STL * stl;
  IDX j;
  BOOL ok;

  if (!stl_create (&stl))                /* create string list        */
   return (FALSE);

  ok = sch_itm_get_stl (sch, i, stl) ? TRUE : FALSE; /* attributes of item i */

  for (j = 0; ok && j < stl -> n; j++)   /* loop over all attributes  */
  {
   char * sym;

   if (!stl_get (stl, j, &sym))          /* get string from list      */
    ok = FALSE;
   else if (!smi_make_cmp (sym, fi -> dtp, cmp_idx)) /* insert component */
    ok = FALSE;
   else
    cmp_idx++;
  }

  stl_delete (&stl);                     /* free list, also on failure */

  if (!ok)
   return (FALSE);
 }

 return (TRUE);
}

/******************************************************************************/
/** Update table "frq" for a changed record: for every component the old
 *  value is decremented and the new value is incremented.
 *
 * The work runs between the configured BEGIN and COMMIT statements (default
 * "BEGIN"/"COMMIT"). If any step fails, the transaction is rolled back
 * instead of committed, so a half applied update is not made permanent.
 * NOTE(review): "ROLLBACK" is hard coded because no configuration key for it
 * is visible in this file.
 *
 * @param rec_old pointer to old record
 * @param rec_new pointer to new record; must have as many items as rec_old
 * @return TRUE if all frequencies were updated and committed, FALSE
 *         otherwise
 */

BOOL smi_update (PSX_REC * rec_old, PSX_REC * rec_new)
{
 BOOL r = TRUE;
 IDX i;
 PSX_STR sql;

 assert (rec_old != NULL && rec_new != NULL && rec_new -> n == rec_old -> n);

 if (!smi_init ())                               /* initialize smi      */
 {
  smi_exit ();                   /* release whatever was set up so far */
  return (FALSE);
 }

 if (!psx_cfg_get (PSX_CFG_SQL_BEGIN, sql))      /* no config => default */
  strcpy (sql, "BEGIN");

 /* the result was never checked here; keep going, but leave a trace */
 if (!dbi_execute (dbi, sql))
  LOG (MOD, "could not begin transaction");

 for (i = 0; i < num_cmps; i++)                  /* over all components */
 {
  char * oldval;
  char * newval;

  if (!rec_get (rec_old, cmps[i].atr, &oldval)) /* get value of attribute from old record */
  {
   LOG (MOD, "Required attribute not found in old record!");
   r = FALSE;
   break;
  }
  if (!rec_get (rec_new, cmps[i].atr, &newval)) /* get value of attribute from new record */
  {
   LOG (MOD, "Required attribute not found in new record!");
   r = FALSE;
   break;
  }
  if (!smi_dec_by_idx (i, oldval)) /* decrement old value */
  {
   r = FALSE;
   break;
  }
  if (!smi_inc_by_idx (i, newval)) /* increment new value */
  {
   r = FALSE;
   break;
  }
 }

 if (r)
 {
  if (!psx_cfg_get (PSX_CFG_SQL_COMMIT, sql))    /* no config => default */
   strcpy (sql, "COMMIT");

  if (!dbi_execute (dbi, sql)) r = FALSE;
 }
 else
 {
  /* best effort: the call has failed already, whatever ROLLBACK returns */
  dbi_execute (dbi, "ROLLBACK");
 }

 smi_exit ();

 if (!r)
  LOG (MOD, "update failed");

 return (r);
}

/******************************************************************************/
/** Increase the frequency of every value of a record in table "frq".
 *
 * The work runs between the configured BEGIN and COMMIT statements (default
 * "BEGIN"/"COMMIT"). If any step fails, the transaction is rolled back
 * instead of committed, so a partially counted record is not made permanent.
 * NOTE(review): "ROLLBACK" is hard coded because no configuration key for it
 * is visible in this file.
 *
 * @param rec pointer to record
 * @return TRUE if all frequencies were increased and committed, FALSE
 *         otherwise
 */

BOOL smi_insert (PSX_REC * rec)
{
 int i;
 BOOL r = TRUE;
 PSX_STR sql;

 assert (rec != NULL);

 if (!smi_init ())                            /* initialize smi             */
 {
  smi_exit ();                /* release whatever was set up so far */
  return (FALSE);
 }

 if (!psx_cfg_get (PSX_CFG_SQL_BEGIN,sql))    /* no config => default       */
  strcpy (sql,"BEGIN");

 /* the result was never checked here; keep going, but leave a trace */
 if (!dbi_execute (dbi, sql))
  LOG (MOD, "could not begin transaction");

 for (i = 0; i != rec -> n; i++)              /* over all entries in record */
 {
  IDX idx;

  if (!smi_get_idx (rec -> itm[i].atr, &idx)) /* get index of attribute     */
  {
   psx_put ("get_idx failed (%s)", rec -> itm[i].atr);
   r = FALSE;
   break;
  }
  if (!smi_inc_by_idx (idx, rec -> itm[i].val)) /* increase value           */
  {
   psx_put ("inc_by_idx failed");
   r = FALSE;
   break;
  }
 }

 if (r)
 {
  if (!psx_cfg_get (PSX_CFG_SQL_COMMIT,sql))  /* no config => default       */
   strcpy (sql,"COMMIT");

  if (!dbi_execute (dbi, sql)) r = FALSE;
 }
 else
 {
  /* best effort: the call has failed already, whatever ROLLBACK returns */
  dbi_execute (dbi, "ROLLBACK");
 }

 smi_exit ();

 if (!r) LOG(MOD, "insert failed");

 return (r);
}

/*****************************************************************************/  
/**** FUNCTION FOR TEST USAGE ****/
/** Test driver: weigh the request record against every row of table "rec".
 *
 * The request record is brought into component order, then each row of
 * "rec" is turned into a second record (columns 3 and up) and weighed
 * against it. The weights are only computed, not reported (the output is
 * disabled with "#if 0"). Both records and the component array are released
 * on every path, including the error paths.
 *
 * @param req pointer to request; req -> rec must be set
 * @return TRUE if every row could be weighed, FALSE otherwise
 */

BOOL smi_test (PDI_REQ * req)
{
 IDX i;
 PSX_REC * rec1 = NULL;
 PSX_REC * rec2 = NULL;
 int total;
 BOOL ok = FALSE;

 LOG (MOD,"start testing");

 assert (req != NULL && req -> rec != NULL);

 if (!smi_init ())
  goto done;

 if (!smi_get_total (&total))
  goto done;

 if (!rec_create (&rec1))
 {
  rec1 = NULL;          /* nothing to delete, whatever rec_create left behind */
  goto done;
 }

 if (!rec_create (&rec2))
 {
  rec2 = NULL;
  goto done;
 }

 if (!smi_sort_rec (req -> rec,rec1))
  goto done;

 if (!dbi_select (dbi,"rec","*",NULL))
  goto done;

 for (i = 0; i < total; i++)
 {
  IDX j;
  double weight;
  NUM fields;

  if (!dbi_fetch (dbi,0,NULL))
   goto done;

  if (!dbi_get_fields (dbi,&fields))
   goto done;

  for (j = 3; j < fields; j++)
  {
   char * val;

   if (!dbi_fetch (dbi,j,&val))
   {
    psx_put ("fetch() failed!");
    goto done;
   }

   /* NOTE(review): column 3 is paired with cmps[1]; if column 3 holds the
    * first component this should be j - 3, and j - 2 can reach num_cmps.
    * Confirm against the layout of table "rec". */
   if (!rec_add1 (rec2,cmps[j - 2].atr,val))
    goto done;
  }

  if (!dbi_push (dbi))
  {
   psx_put ("dbi_push() failed!");
   goto done;
  }

  if (!smi_weight_rec (rec1,rec2,total,&weight))
  {
   psx_put ("smi_weight_rec failed!");
   dbi_pop (dbi);        /* best effort: balance the dbi_push() above */
   goto done;
  }

#if 0
  psx_put ("weight: %f",weight);

  if (weight > 0)
  {
   psx_put ("possible match:");
   for (j = 0; j < rec1 -> n; j++)
   {
    psx_put ("%s: %s <-> %s",  rec1 -> itm[j].atr, rec1 -> itm[j].val, rec2 -> itm[j].val);
   }
  }
#endif

  if (!dbi_pop (dbi))
  {
   psx_put ("dbi_pop() failed!");
   goto done;
  }

  rec_clear (rec2);
 }

 ok = TRUE;

done:
 if (rec1 != NULL)
  rec_delete (&rec1);
 if (rec2 != NULL)
  rec_delete (&rec2);

 smi_exit ();

 if (ok)
  LOG (MOD,"end testing");

 return (ok);
}

/******************************************************************************/
/******************************************************************************/
/******************************************************************************/





