# Copyright (C) 1995, Index Data I/S
# All rights reserved.
# Sebastian Hammer, Adam Dickmeiss
-# $Id: Makefile,v 1.7 1995-09-06 16:11:15 adam Exp $
+# $Id: Makefile,v 1.8 1995-09-11 13:09:31 adam Exp $
SHELL=/bin/sh
RANLIB=ranlib
$(TPROG2): $(O2) $(YAZLIB)
$(CC) $(CFLAGS) -o $(TPROG2) $(O2) $(YAZLIB)
-$(TPROG3): $(O3) ../lib/rset.a \
- ../lib/dict.a ../lib/isam.a ../lib/bfile.a \
- ../lib/dfa.a ../lib/alexutil.a $(YAZLIB)
- $(CC) $(CFLAGS) -o $(TPROG3) $(O3) ../lib/rset.a \
- ../lib/dict.a ../lib/isam.a ../lib/bfile.a \
+$(TPROG3): $(O3) \
+ ../lib/rset.a ../lib/dict.a ../lib/isam.a ../lib/bfile.a \
../lib/dfa.a ../lib/alexutil.a $(YAZLIB)
+ $(CC) $(CFLAGS) -o $(TPROG3) $(O3) \
+ ../lib/rset.a ../lib/dict.a ../lib/isam.a ../lib/bfile.a \
+ ../lib/dfa.a ../lib/alexutil.a $(YAZLIB) -lm
.c.o:
$(CC) -c $(DEFS) $(CFLAGS) $<
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: extract.c,v $
- * Revision 1.6 1995-09-08 14:52:27 adam
+ * Revision 1.7 1995-09-11 13:09:32 adam
+ * More work on relevance feedback.
+ *
+ * Revision 1.6 1995/09/08 14:52:27 adam
* Minor changes. Dictionary is lower case now.
*
* Revision 1.5 1995/09/06 16:11:16 adam
key_offset += sizeof(*k);
}
+#if !IT_KEY_HAVE_SEQNO /* key_write_x is only built when keys carry a frequency count rather than a sequence number */
void key_write_x (struct strtab *t, int cmd, struct it_key *k, const char *str)
{
void **oldinfo;
if (strtab_src (t, str, &oldinfo))
- ((struct it_key *) *oldinfo)->seqno++;
+ ((struct it_key *) *oldinfo)->freq++; /* strtab_src returned non-zero (presumably: str already present): bump the stored count */
else
{
*oldinfo = xmalloc (sizeof(*k));
memcpy (*oldinfo, k, sizeof(*k));
- ((struct it_key *) *oldinfo)->seqno = 1;
+ ((struct it_key *) *oldinfo)->freq = 1; /* fresh entry: a private copy of *k whose count starts at 1 */
}
}
+#endif /* !IT_KEY_HAVE_SEQNO */
void key_rec_flush (const char *str, void *info, void *data)
{
{
FILE *inf;
struct it_key k;
+#if IT_KEY_HAVE_SEQNO
int seqno = 1;
+#endif
int c;
char w[IT_MAX_WORD];
while ((c=getc (inf)) != EOF)
{
int i = 0;
- while (i < 254 && c != EOF && isalnum(c))
+ while (i < IT_MAX_WORD-1 && c != EOF && isalnum(c))
{
w[i++] = index_char_cvt (c);
c = getc (inf);
{
w[i] = 0;
- k.seqno = seqno++;
#if IT_KEY_HAVE_FIELD
k.field = 0;
#endif
+#if IT_KEY_HAVE_SEQNO
+ k.seqno = seqno++;
+ key_write (cmd, &k, w);
+#else
key_write_x (t, cmd, &k, w);
+#endif
}
if (c == EOF)
break;
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: index.h,v $
- * Revision 1.8 1995-09-08 14:52:27 adam
+ * Revision 1.9 1995-09-11 13:09:33 adam
+ * More work on relevance feedback.
+ *
+ * Revision 1.8 1995/09/08 14:52:27 adam
* Minor changes. Dictionary is lower case now.
*
* Revision 1.7 1995/09/06 16:11:16 adam
#include <isam.h>
#define IT_MAX_WORD 256
+#define IT_KEY_HAVE_SEQNO 1
#define IT_KEY_HAVE_FIELD 0
struct it_key {
int sysno;
+#if IT_KEY_HAVE_SEQNO
int seqno;
+#else
+ int freq;
+#endif
#if IT_KEY_HAVE_FIELD
int field;
#endif
void key_flush (void);
void key_write (int cmd, struct it_key *k, const char *str);
int key_compare (const void *p1, const void *p2);
+void key_logdump (int mask, const void *p);
void key_input (const char *dict_fname, const char *isam_fname,
const char *key_fname, int cache);
int key_sort (const char *key_fname, size_t mem);
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: kcompare.c,v $
- * Revision 1.4 1995-09-08 14:52:27 adam
+ * Revision 1.5 1995-09-11 13:09:34 adam
+ * More work on relevance feedback.
+ *
+ * Revision 1.4 1995/09/08 14:52:27 adam
* Minor changes. Dictionary is lower case now.
*
* Revision 1.3 1995/09/07 13:58:36 adam
#include "index.h"
+/*
+ * key_logdump: write one index key to the log.
+ *   logmask  log level/mask handed straight to logf
+ *   p        raw bytes of a struct it_key; copied into a typed local
+ *            before any field is read
+ * The second column is the sequence number or the frequency, depending
+ * on how IT_KEY_HAVE_SEQNO is configured.
+ */
+void key_logdump (int logmask, const void *p)
+{
+    struct it_key entry;
+
+    memcpy (&entry, p, sizeof(entry));
+#if !IT_KEY_HAVE_SEQNO
+    /* frequency-style keys */
+    logf (logmask, "%7d f=%-3d", entry.sysno, entry.freq);
+#else
+    /* sequence-number-style keys */
+    logf (logmask, "%7d s=%-3d", entry.sysno, entry.seqno);
+#endif
+}
+
int key_compare (const void *p1, const void *p2)
{
struct it_key i1, i2;
else
return -2;
}
+#if IT_KEY_HAVE_SEQNO
if (i1.seqno != i2.seqno)
{
if (i1.seqno > i2.seqno)
else
return -1;
}
+#else
+ if (i1.freq != i2.freq)
+ {
+ if (i1.freq > i2.freq)
+ return 1;
+ else
+ return -1;
+ }
+#endif
#if IT_KEY_HAVE_FIELD
if (i1.field != i2.field)
{
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: kdump.c,v $
- * Revision 1.4 1995-09-08 14:52:27 adam
+ * Revision 1.5 1995-09-11 13:09:35 adam
+ * More work on relevance feedback.
+ *
+ * Revision 1.4 1995/09/08 14:52:27 adam
* Minor changes. Dictionary is lower case now.
*
* Revision 1.3 1995/09/06 16:11:17 adam
struct it_key k;
memcpy (&k, key_info+1, sizeof(k));
- printf ("%7d op=%d s=%-3d %s\n", k.sysno, *key_info, k.seqno,
+#if IT_KEY_HAVE_SEQNO
+ printf ("%7d op=%d s=%-5d %s\n", k.sysno, *key_info, k.seqno,
+ key_string);
+#else
+ printf ("%7d op=%d f=%-3d %s\n", k.sysno, *key_info, k.freq,
key_string);
+
+#endif
}
if (fclose (inf))
{
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: zrpn.c,v $
- * Revision 1.8 1995-09-08 14:52:27 adam
+ * Revision 1.9 1995-09-11 13:09:35 adam
+ * More work on relevance feedback.
+ *
+ * Revision 1.8 1995/09/08 14:52:27 adam
* Minor changes. Dictionary is lower case now.
*
* Revision 1.7 1995/09/07 13:58:36 adam
#include <rstemp.h>
#include <rsnull.h>
#include <rsbool.h>
+#include <rsrel.h>
+
+/*
+ * split_term: split a general term into space-separated words and look
+ * each word up in the word dictionary.
+ *
+ *   zi       server info; zi->wordDict is the dictionary searched
+ *   term     query term; only Z_Term_general is handled
+ *   isam_ps  out: set to an array of ISAM positions, one per word found
+ *   no       out: number of entries stored in *isam_ps
+ *
+ * Returns 1 on success (possibly with *no == 0) and 0 when the term is
+ * not a general term; in the latter case *isam_ps and *no are not
+ * written.
+ *
+ * The result array is static storage inside this function: it is
+ * overwritten by the next call and holds at most 16 positions.  Words
+ * found after the array is full are ignored instead of being written
+ * past its end.  The term is truncated to IT_MAX_WORD characters.
+ */
+int split_term (ZServerInfo *zi, Z_Term *term, ISAM_P **isam_ps, int *no)
+{
+    static ISAM_P isam_p[16];
+    const int isam_p_max = (int) (sizeof(isam_p) / sizeof(*isam_p));
+    int isam_p_indx = 0;
+    char termz[IT_MAX_WORD+1];
+    char term_sub[IT_MAX_WORD+1];
+    int sizez, i;
+    char *p0, *p1;
+    const char *info;
+
+    if (term->which != Z_Term_general)
+        return 0;
+    sizez = term->u.general->len;
+    if (sizez > IT_MAX_WORD)
+        sizez = IT_MAX_WORD;
+    /* convert every character with index_char_cvt and NUL-terminate */
+    for (i = 0; i<sizez; i++)
+        termz[i] = index_char_cvt (term->u.general->buf[i]);
+    termz[i] = '\0';
+
+    p0 = termz;
+    while (1)
+    {
+        /* isolate the next word: up to the next space, or the rest */
+        if ((p1 = strchr (p0, ' ')))
+        {
+            memcpy (term_sub, p0, p1-p0);
+            term_sub[p1-p0] = '\0';
+        }
+        else
+            strcpy (term_sub, p0);
+        logf (LOG_DEBUG, "dict_lookup: %s", term_sub);
+        if ((info = dict_lookup (zi->wordDict, term_sub)))
+        {
+            logf (LOG_DEBUG, " found");
+            /* first byte of the dictionary info is the payload length */
+            assert (*info == sizeof(*isam_p));
+            if (isam_p_indx >= isam_p_max)
+            {
+                /* result array is full: stop rather than overflow it */
+                logf (LOG_DEBUG, "more than %d terms found; rest ignored",
+                      isam_p_max);
+                break;
+            }
+            memcpy (isam_p + isam_p_indx, info+1, sizeof(*isam_p));
+            isam_p_indx++;
+        }
+        if (!p1)
+            break;
+        p0 = p1+1;
+    }
+    *isam_ps = isam_p;
+    *no = isam_p_indx;
+    logf (LOG_DEBUG, "%d positions", *no);
+    return 1;
+}
+
+
+/*
+ * rpn_search_APT_relevance: build a result set for one
+ * attributes-plus-term node using the relevance result-set kind.
+ *
+ * The term is split into words by split_term.  When at least one word
+ * is found in the dictionary a relevance result set is created over
+ * their ISAM positions; otherwise a null result set is returned.  That
+ * includes the case where split_term rejects the term (returns 0): it
+ * then leaves its output arguments untouched, so they are preset here
+ * and its return value is checked rather than reading uninitialised
+ * fields.
+ */
+static RSET rpn_search_APT_relevance (ZServerInfo *zi,
+                                      Z_AttributesPlusTerm *zapt)
+{
+    rset_relevance_parms parms;
+
+    parms.key_size = sizeof(struct it_key);
+    parms.max_rec = 100;
+    parms.cmp = key_compare;
+    parms.is = zi->wordIsam;
+    /* defined values even if split_term bails out early */
+    parms.isam_positions = NULL;
+    parms.no_isam_positions = 0;
+    if (split_term (zi, zapt->term, &parms.isam_positions,
+                    &parms.no_isam_positions)
+        && parms.no_isam_positions > 0)
+        return rset_create (rset_kind_relevance, &parms);
+    return rset_create (rset_kind_null, NULL);
+}
static RSET rpn_search_APT (ZServerInfo *zi, Z_AttributesPlusTerm *zapt)
{
+#if 0
+ Z_Term *term = zapt->term;
char termz[IT_MAX_WORD+1];
size_t sizez;
struct rset_isam_parms parms;
const char *info;
int i;
- Z_Term *term = zapt->term;
if (term->which != Z_Term_general)
return NULL;
parms.is = zi->wordIsam;
logf (LOG_DEBUG, "rset_create isam");
return rset_create (rset_kind_isam, &parms);
+#else
+ return rpn_search_APT_relevance (zi, zapt);
+#endif
}
static RSET rpn_search_ref (ZServerInfo *zi, Z_ResultSetId *resultSetId)
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: rsrel.c,v $
- * Revision 1.1 1995-09-08 14:52:42 adam
+ * Revision 1.2 1995-09-11 13:09:41 adam
+ * More work on relevance feedback.
+ *
+ * Revision 1.1 1995/09/08 14:52:42 adam
* Work on relevance feedback.
*
*/
#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
#include <assert.h>
#include <isam.h>
static void relevance (struct rset_rel_info *info, rset_relevance_parms *parms)
{
- char *isam_buf;
+ char **isam_buf; /* one key buffer per ISAM position */
+ char *isam_tmp_buf; /* copy of the key currently being scored */
int *isam_r;
+ int *max_tf; /* per position: is_numkeys() result, used to scale tf below */
ISPT *isam_pt;
+ double *wgt; /* per position: weight for the key group being scored */
int i;
- isam_buf = xmalloc (info->key_size * parms->no_isam_positions);
+ logf (LOG_DEBUG, "relevance");
+ isam_buf = xmalloc (parms->no_isam_positions * sizeof(*isam_buf));
isam_r = xmalloc (sizeof (*isam_r) * parms->no_isam_positions);
isam_pt = xmalloc (sizeof (*isam_pt) * parms->no_isam_positions);
+ isam_tmp_buf = xmalloc (info->key_size);
+ max_tf = xmalloc (sizeof (*max_tf) * parms->no_isam_positions);
+ wgt = xmalloc (sizeof (*wgt) * parms->no_isam_positions);
for (i = 0; i<parms->no_isam_positions; i++)
{
+ isam_buf[i] = xmalloc (info->key_size);
isam_pt[i] = is_position (parms->is, parms->isam_positions[i]);
- isam_r[i] = is_readkey (isam_pt[i], isam_buf + i*info->key_size);
+ max_tf [i] = is_numkeys (isam_pt[i]);
+ isam_r[i] = is_readkey (isam_pt[i], isam_buf[i]); /* prime each stream with its first key; isam_r[i] is tested below as "a key is pending" */
+ logf (LOG_DEBUG, "max tf %d = %d", i, max_tf[i]);
}
+ while (1) /* each pass scores one group of keys; ends when no stream has a key pending */
+ {
+ int min = -1, i; /* NOTE: this i shadows the function-level i */
+ double length, similarity;
+
+ /* find min with lowest sysno */
+ for (i = 0; i<parms->no_isam_positions; i++)
+ if (isam_r[i] &&
+ (min < 0 || (*parms->cmp)(isam_buf[i], isam_buf[min]) < 1)) /* "< 1": take i when its key is not greater than the current minimum */
+ min = i;
+ if (min < 0)
+ break;
+ memcpy (isam_tmp_buf, isam_buf[min], info->key_size); /* snapshot: isam_buf[min] is overwritten while its stream is advanced below */
+ logf (LOG_LOG, "calc rel for");
+ key_logdump (LOG_LOG, isam_tmp_buf);
+ /* calculate for all with those sysno */
+ length = 0.0;
+ for (i = 0; i<parms->no_isam_positions; i++)
+ {
+ int r;
+ if (isam_r[i])
+ r = (*parms->cmp)(isam_buf[i], isam_tmp_buf);
+ else
+ r = 2; /* exhausted stream: force the "no contribution" branch */
+ if (r > 1 || r < -1) /* |r| > 1: pending key is outside the scored group (presumably a different sysno - confirm against key_compare) */
+ wgt[i] = 0.0;
+ else
+ {
+ int tf = 0;
+ do /* count and consume the consecutive keys that stay within the group */
+ {
+ tf++;
+ isam_r[i] = is_readkey (isam_pt[i], isam_buf[i]);
+ } while (isam_r[i] &&
+ (*parms->cmp)(isam_buf[i], isam_tmp_buf) <= 1);
+ logf (LOG_DEBUG, "tf%d = %d", i, tf);
+ wgt[i] = 0.5+tf*0.5/max_tf[i]; /* weight = 0.5 + 0.5 * tf / max_tf[i] */
+ length += wgt[i] * wgt[i]; /* accumulate squared weights */
+ }
+ }
+ /* calculate relevance value */
+ length = sqrt (length); /* Euclidean length of the weight vector */
+ similarity = 0.0;
+ for (i = 0; i<parms->no_isam_positions; i++)
+ similarity += wgt[i]/length; /* sum of the length-normalised weights */
+ logf (LOG_LOG, " %f", similarity);
+ /* if value is in the top score, then save it - don't emit yet (TODO: nothing is saved here; the score is only logged) */
+ }
for (i = 0; i<parms->no_isam_positions; i++)
+ {
is_pt_free (isam_pt[i]);
+ xfree (isam_buf[i]);
+ }
+ xfree (max_tf);
+ xfree (isam_tmp_buf);
xfree (isam_buf);
xfree (isam_r);
xfree (isam_pt);
+ xfree (wgt);
}
static rset_control *r_create (const struct rset_control *sel, void *parms)