From: Adam Dickmeiss Date: Mon, 11 Sep 1995 13:09:31 +0000 (+0000) Subject: More work on relevance feedback. X-Git-Tag: ZEBRA.1.0~761 X-Git-Url: http://sru.miketaylor.org.uk/cgi-bin?a=commitdiff_plain;h=0b7781e8c428a4bca098fff6f074fef83c06e560;p=idzebra-moved-to-github.git More work on relevance feedback. --- diff --git a/index/Makefile b/index/Makefile index ec6c61b..b9c1bf8 100644 --- a/index/Makefile +++ b/index/Makefile @@ -1,7 +1,7 @@ # Copyright (C) 1995, Index Data I/S # All rights reserved. # Sebastian Hammer, Adam Dickmeiss -# $Id: Makefile,v 1.7 1995-09-06 16:11:15 adam Exp $ +# $Id: Makefile,v 1.8 1995-09-11 13:09:31 adam Exp $ SHELL=/bin/sh RANLIB=ranlib @@ -27,12 +27,12 @@ $(TPROG1): $(O1) ../lib/dict.a \ $(TPROG2): $(O2) $(YAZLIB) $(CC) $(CFLAGS) -o $(TPROG2) $(O2) $(YAZLIB) -$(TPROG3): $(O3) ../lib/rset.a \ - ../lib/dict.a ../lib/isam.a ../lib/bfile.a \ - ../lib/dfa.a ../lib/alexutil.a $(YAZLIB) - $(CC) $(CFLAGS) -o $(TPROG3) $(O3) ../lib/rset.a \ - ../lib/dict.a ../lib/isam.a ../lib/bfile.a \ +$(TPROG3): $(O3) \ + ../lib/rset.a ../lib/dict.a ../lib/isam.a ../lib/bfile.a \ ../lib/dfa.a ../lib/alexutil.a $(YAZLIB) + $(CC) $(CFLAGS) -o $(TPROG3) $(O3) \ + ../lib/rset.a ../lib/dict.a ../lib/isam.a ../lib/bfile.a \ + ../lib/dfa.a ../lib/alexutil.a $(YAZLIB) -lm .c.o: $(CC) -c $(DEFS) $(CFLAGS) $< diff --git a/index/extract.c b/index/extract.c index 2050980..ce4ed96 100644 --- a/index/extract.c +++ b/index/extract.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: extract.c,v $ - * Revision 1.6 1995-09-08 14:52:27 adam + * Revision 1.7 1995-09-11 13:09:32 adam + * More work on relevance feedback. + * + * Revision 1.6 1995/09/08 14:52:27 adam * Minor changes. Dictionary is lower case now. 
* * Revision 1.5 1995/09/06 16:11:16 adam @@ -129,19 +132,21 @@ void key_write (int cmd, struct it_key *k, const char *str) key_offset += sizeof(*k); } +#if !IT_KEY_HAVE_SEQNO void key_write_x (struct strtab *t, int cmd, struct it_key *k, const char *str) { void **oldinfo; if (strtab_src (t, str, &oldinfo)) - ((struct it_key *) *oldinfo)->seqno++; + ((struct it_key *) *oldinfo)->freq++; else { *oldinfo = xmalloc (sizeof(*k)); memcpy (*oldinfo, k, sizeof(*k)); - ((struct it_key *) *oldinfo)->seqno = 1; + ((struct it_key *) *oldinfo)->freq = 1; } } +#endif void key_rec_flush (const char *str, void *info, void *data) { @@ -153,7 +158,9 @@ void text_extract (struct strtab *t, SYSNO sysno, int cmd, const char *fname) { FILE *inf; struct it_key k; +#if IT_KEY_HAVE_SEQNO int seqno = 1; +#endif int c; char w[IT_MAX_WORD]; @@ -168,7 +175,7 @@ void text_extract (struct strtab *t, SYSNO sysno, int cmd, const char *fname) while ((c=getc (inf)) != EOF) { int i = 0; - while (i < 254 && c != EOF && isalnum(c)) + while (i < IT_MAX_WORD-1 && c != EOF && isalnum(c)) { w[i++] = index_char_cvt (c); c = getc (inf); @@ -177,11 +184,15 @@ void text_extract (struct strtab *t, SYSNO sysno, int cmd, const char *fname) { w[i] = 0; - k.seqno = seqno++; #if IT_KEY_HAVE_FIELD k.field = 0; #endif +#if IT_KEY_HAVE_SEQNO + k.seqno = seqno++; + key_write (cmd, &k, w); +#else key_write_x (t, cmd, &k, w); +#endif } if (c == EOF) break; diff --git a/index/index.h b/index/index.h index 805abde..ac519d2 100644 --- a/index/index.h +++ b/index/index.h @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: index.h,v $ - * Revision 1.8 1995-09-08 14:52:27 adam + * Revision 1.9 1995-09-11 13:09:33 adam + * More work on relevance feedback. + * + * Revision 1.8 1995/09/08 14:52:27 adam * Minor changes. Dictionary is lower case now. 
* * Revision 1.7 1995/09/06 16:11:16 adam @@ -37,11 +40,16 @@ #include #define IT_MAX_WORD 256 +#define IT_KEY_HAVE_SEQNO 1 #define IT_KEY_HAVE_FIELD 0 struct it_key { int sysno; +#if IT_KEY_HAVE_SEQNO int seqno; +#else + int freq; +#endif #if IT_KEY_HAVE_FIELD int field; #endif @@ -63,6 +71,7 @@ int key_close (void); void key_flush (void); void key_write (int cmd, struct it_key *k, const char *str); int key_compare (const void *p1, const void *p2); +void key_logdump (int mask, const void *p); void key_input (const char *dict_fname, const char *isam_fname, const char *key_fname, int cache); int key_sort (const char *key_fname, size_t mem); diff --git a/index/kcompare.c b/index/kcompare.c index 6f9bbfd..bb37339 100644 --- a/index/kcompare.c +++ b/index/kcompare.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: kcompare.c,v $ - * Revision 1.4 1995-09-08 14:52:27 adam + * Revision 1.5 1995-09-11 13:09:34 adam + * More work on relevance feedback. + * + * Revision 1.4 1995/09/08 14:52:27 adam * Minor changes. Dictionary is lower case now. 
* * Revision 1.3 1995/09/07 13:58:36 adam @@ -31,6 +34,18 @@ #include "index.h" +void key_logdump (int logmask, const void *p) +{ + struct it_key key; + + memcpy (&key, p, sizeof(key)); +#if IT_KEY_HAVE_SEQNO + logf (logmask, "%7d s=%-3d", key.sysno, key.seqno); +#else + logf (logmask, "%7d f=%-3d", key.sysno, key.freq); +#endif +} + int key_compare (const void *p1, const void *p2) { struct it_key i1, i2; @@ -43,6 +58,7 @@ int key_compare (const void *p1, const void *p2) else return -2; } +#if IT_KEY_HAVE_SEQNO if (i1.seqno != i2.seqno) { if (i1.seqno > i2.seqno) @@ -50,6 +66,15 @@ int key_compare (const void *p1, const void *p2) else return -1; } +#else + if (i1.freq != i2.freq) + { + if (i1.freq > i2.freq) + return 1; + else + return -1; + } +#endif #if IT_KEY_HAVE_FIELD if (i1.field != i2.field) { diff --git a/index/kdump.c b/index/kdump.c index d6f2555..d13d7a4 100644 --- a/index/kdump.c +++ b/index/kdump.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: kdump.c,v $ - * Revision 1.4 1995-09-08 14:52:27 adam + * Revision 1.5 1995-09-11 13:09:35 adam + * More work on relevance feedback. + * + * Revision 1.4 1995/09/08 14:52:27 adam * Minor changes. Dictionary is lower case now. * * Revision 1.3 1995/09/06 16:11:17 adam @@ -85,8 +88,14 @@ int main (int argc, char **argv) struct it_key k; memcpy (&k, key_info+1, sizeof(k)); - printf ("%7d op=%d s=%-3d %s\n", k.sysno, *key_info, k.seqno, +#if IT_KEY_HAVE_SEQNO + printf ("%7d op=%d s=%-5d %s\n", k.sysno, *key_info, k.seqno, + key_string); +#else + printf ("%7d op=%d f=%-3d %s\n", k.sysno, *key_info, k.freq, key_string); + +#endif } if (fclose (inf)) { diff --git a/index/zrpn.c b/index/zrpn.c index 6a30f80..51d8510 100644 --- a/index/zrpn.c +++ b/index/zrpn.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: zrpn.c,v $ - * Revision 1.8 1995-09-08 14:52:27 adam + * Revision 1.9 1995-09-11 13:09:35 adam + * More work on relevance feedback. 
+ * + * Revision 1.8 1995/09/08 14:52:27 adam * Minor changes. Dictionary is lower case now. * * Revision 1.7 1995/09/07 13:58:36 adam @@ -44,15 +47,81 @@ #include #include #include +#include + +int split_term (ZServerInfo *zi, Z_Term *term, ISAM_P **isam_ps, int *no) +{ + static ISAM_P isam_p[16]; + int isam_p_indx = 0; + char termz[IT_MAX_WORD+1]; + char term_sub[IT_MAX_WORD+1]; + int sizez, i; + char *p0, *p1; + const char *info; + + if (term->which != Z_Term_general) + return 0; + sizez = term->u.general->len; + if (sizez > IT_MAX_WORD) + sizez = IT_MAX_WORD; + for (i = 0; i<sizez; i++) + termz[i] = index_char_cvt (term->u.general->buf[i]); + termz[i] = '\0'; + + p0 = termz; + while (1) + { + if ((p1 = strchr (p0, ' '))) + { + memcpy (term_sub, p0, p1-p0); + term_sub[p1-p0] = '\0'; + } + else + strcpy (term_sub, p0); + logf (LOG_DEBUG, "dict_lookup: %s", term_sub); + if ((info = dict_lookup (zi->wordDict, term_sub))) + { + logf (LOG_DEBUG, " found"); + assert (*info == sizeof(*isam_p)); + memcpy (isam_p + isam_p_indx, info+1, sizeof(*isam_p)); + isam_p_indx++; + } + if (!p1) + break; + p0 = p1+1; + } + *isam_ps = isam_p; + *no = isam_p_indx; + logf (LOG_DEBUG, "%d positions", *no); + return 1; +} + +static RSET rpn_search_APT_relevance (ZServerInfo *zi, + Z_AttributesPlusTerm *zapt) +{ + rset_relevance_parms parms; + + parms.key_size = sizeof(struct it_key); + parms.max_rec = 100; + parms.cmp = key_compare; + parms.is = zi->wordIsam; + split_term (zi, zapt->term, &parms.isam_positions, + &parms.no_isam_positions); + if (parms.no_isam_positions > 0) + return rset_create (rset_kind_relevance, &parms); + else + return rset_create (rset_kind_null, NULL); +} static RSET rpn_search_APT (ZServerInfo *zi, Z_AttributesPlusTerm *zapt) { +#if 0 + Z_Term *term = zapt->term; char termz[IT_MAX_WORD+1]; size_t sizez; struct rset_isam_parms parms; const char *info; int i; - Z_Term *term = zapt->term; if (term->which != Z_Term_general) return NULL; @@ -70,6 +139,9 @@ static RSET rpn_search_APT (ZServerInfo *zi, 
Z_AttributesPlusTerm *zapt) parms.is = zi->wordIsam; logf (LOG_DEBUG, "rset_create isam"); return rset_create (rset_kind_isam, &parms); +#else + return rpn_search_APT_relevance (zi, zapt); +#endif } static RSET rpn_search_ref (ZServerInfo *zi, Z_ResultSetId *resultSetId) diff --git a/rset/rsrel.c b/rset/rsrel.c index 4faed5b..f52cc26 100644 --- a/rset/rsrel.c +++ b/rset/rsrel.c @@ -4,12 +4,17 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: rsrel.c,v $ - * Revision 1.1 1995-09-08 14:52:42 adam + * Revision 1.2 1995-09-11 13:09:41 adam + * More work on relevance feedback. + * + * Revision 1.1 1995/09/08 14:52:42 adam * Work on relevance feedback. * */ #include +#include +#include #include #include @@ -60,26 +65,90 @@ struct rset_rel_rfd { static void relevance (struct rset_rel_info *info, rset_relevance_parms *parms) { - char *isam_buf; + char **isam_buf; + char *isam_tmp_buf; int *isam_r; + int *max_tf; ISPT *isam_pt; + double *wgt; int i; - isam_buf = xmalloc (info->key_size * parms->no_isam_positions); + logf (LOG_DEBUG, "relevance"); + isam_buf = xmalloc (parms->no_isam_positions * sizeof(*isam_buf)); isam_r = xmalloc (sizeof (*isam_r) * parms->no_isam_positions); isam_pt = xmalloc (sizeof (*isam_pt) * parms->no_isam_positions); + isam_tmp_buf = xmalloc (info->key_size); + max_tf = xmalloc (sizeof (*max_tf) * parms->no_isam_positions); + wgt = xmalloc (sizeof (*wgt) * parms->no_isam_positions); for (i = 0; i<parms->no_isam_positions; i++) { + isam_buf[i] = xmalloc (info->key_size); isam_pt[i] = is_position (parms->is, parms->isam_positions[i]); - isam_r[i] = is_readkey (isam_pt[i], isam_buf + i*info->key_size); + max_tf [i] = is_numkeys (isam_pt[i]); + isam_r[i] = is_readkey (isam_pt[i], isam_buf[i]); + logf (LOG_DEBUG, "max tf %d = %d", i, max_tf[i]); } + while (1) + { + int min = -1, i; + double length, similarity; + + /* find min with lowest sysno */ + for (i = 0; i<parms->no_isam_positions; i++) + if (isam_r[i] && + (min < 0 || (*parms->cmp)(isam_buf[i], isam_buf[min]) < 1)) 
+ min = i; + if (min < 0) + break; + memcpy (isam_tmp_buf, isam_buf[min], info->key_size); + logf (LOG_LOG, "calc rel for"); + key_logdump (LOG_LOG, isam_tmp_buf); + /* calculate for all with those sysno */ + length = 0.0; + for (i = 0; i<parms->no_isam_positions; i++) + { + int r; + if (isam_r[i]) + r = (*parms->cmp)(isam_buf[i], isam_tmp_buf); + else + r = 2; + if (r > 1 || r < -1) + wgt[i] = 0.0; + else + { + int tf = 0; + do + { + tf++; + isam_r[i] = is_readkey (isam_pt[i], isam_buf[i]); + } while (isam_r[i] && + (*parms->cmp)(isam_buf[i], isam_tmp_buf) <= 1); + logf (LOG_DEBUG, "tf%d = %d", i, tf); + wgt[i] = 0.5+tf*0.5/max_tf[i]; + length += wgt[i] * wgt[i]; + } + } + /* calculate relevance value */ + length = sqrt (length); + similarity = 0.0; + for (i = 0; i<parms->no_isam_positions; i++) + similarity += wgt[i]/length; + logf (LOG_LOG, " %f", similarity); + /* if value is in the top score, then save it - don't emit yet */ + } for (i = 0; i<parms->no_isam_positions; i++) + { is_pt_free (isam_pt[i]); + xfree (isam_buf[i]); + } + xfree (max_tf); + xfree (isam_tmp_buf); xfree (isam_buf); xfree (isam_r); xfree (isam_pt); + xfree (wgt); } static rset_control *r_create (const struct rset_control *sel, void *parms)