1 /* $Id: zvrank.c,v 1.11 2004-10-26 15:32:11 heikki Exp $
2 Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
24 Zvrank: an experimental ranking algorithm. See doc/zvrank.txt and
25 source in index/zvrank.c. Enable this by using rank: zvrank in zebra.cfg.
26 Contributed by Johannes Leveling <Johannes.Leveling at
30 /* Zebra Vector Space Model RANKing
32 ** six (seven) letter identifier for weighting scheme
33 ** best document weighting:
34 ** tfc nfc (tpc npc) [original naming]
35 ** ntc atc npc apc [SMART naming, used here]
36 ** best query weighting:
37 ** nfx tfx bfx (npx tpx bpx) [original naming]
38 ** atn ntn btn apn npn bpn [SMART naming]
39 ** -> should set zvrank.weighting-scheme to one of
40 ** "ntc-atn", "atc-atn", etc.
43 #if SKIPTHIS /* FIXME - Disabled while changing the interface to ranking */
45 #include <math.h> /* for log */
57 static double blog(double x) {
58 /* log_2, log_e or log_10 is used, best to change it here if necessary */
61 return log(x); /* / log(base) */
66 struct rank_class_info {
67 char rscheme[8]; /* name of weighting scheme */
71 struct rs_info { /* for result set */
72 int db_docs; /* number of documents in database (collection) */
73 int db_terms; /* number of distinct terms in database (debugging?) */
74 int db_f_max; /* maximum of f_t in database (debugging?) */
75 char *db_f_max_str; /* string (most frequent term) - for debugging */
77 char rscheme[8]; /* name of weighting scheme */
81 void (*d_tf_fct)(void *, void *); /* doc term frequency function */
82 void (*d_idf_fct)(void *, void *); /* doc idf function */
83 void (*d_norm_fct)(void *, void *); /* doc normalization function */
85 void (*q_tf_fct)(void *, void *); /* query term frequency function */
86 void (*q_idf_fct)(void *, void *); /* query idf function */
87 void (*q_norm_fct)(void *, void *); /* query normalization function */
89 double (*sim_fct)(void *, void *); /* similarity function (scoring function) */
93 typedef struct rs_info *RS;
95 static void prn_rs(RS rs) { /* for debugging */
96 yaz_log(LOG_DEBUG, "* RS:");
97 yaz_log(LOG_DEBUG, " db_docs: %d", rs->db_docs);
98 yaz_log(LOG_DEBUG, " db_terms: %d", rs->db_terms);
99 yaz_log(LOG_DEBUG, " f_max: %d", rs->db_f_max);
100 yaz_log(LOG_DEBUG, " f_max_str: %s", rs->db_f_max_str);
101 yaz_log(LOG_DEBUG, " veclen: %d", rs->veclen);
102 /* rscheme implies functions */
103 yaz_log(LOG_DEBUG, " rscheme: %s", rs->rscheme);
107 struct ds_info { /* document info */
108 char *docid; /* unique doc identifier */
109 int docno; /* doc number */
110 int doclen; /* document length */
111 int d_f_max; /* maximum number of any term in doc (needed) */
112 char *d_f_max_str; /* most frequent term in d - for debugging */
113 int veclen; /* vector length */
114 struct ts_info *terms;
115 double docsim; /* similarity in [0, ..., 1] (= score/1000) */
117 typedef struct ds_info* DS;
120 static void prn_ds(DS ds) { /* for debugging */
121 yaz_log(LOG_DEBUG, " * DS:");
122 yaz_log(LOG_DEBUG, " docid: %s", ds->docid);
123 yaz_log(LOG_DEBUG, " docno: %d", ds->docno);
124 yaz_log(LOG_DEBUG, " doclen: %d", ds->doclen);
125 yaz_log(LOG_DEBUG, " d_f_max: %d", ds->d_f_max);
126 yaz_log(LOG_DEBUG, " d_f_max_str:%s", ds->d_f_max_str);
127 yaz_log(LOG_DEBUG, " veclen: %d", ds->veclen);
132 struct ts_info { /* term info */
142 typedef struct ts_info *TS;
145 static void prn_ts(TS ts) { /* for debugging */
146 yaz_log(LOG_DEBUG, " * TERM:%s gocc:%d locc:%d tf:%f idf:%f wt:%f",
147 ts->name, ts->gocc, ts->locc, ts->tf, ts->idf, ts->wt);
157 ** weighting functions
158 ** check: RS is not needed anymore
161 /* calculate and store new term frequency vector */
162 static void tf_none(void *rsi, void *dsi) {
165 /* no conversion. 1 <= tf */
167 for (i=0; i < veclen; i++) {
168 freq=ds->terms[i].locc;
169 ds->terms[i].tf=freq;
174 static void tf_binary(void *rsi, void *dsi) {
179 for (i=0; i < veclen; i++) {
180 freq=ds->terms[i].locc;
189 static void tf_max_norm(void *rsi, void *dsi) {
193 /* divide each term by max, so 0 <= tf <= 1 */
194 tf_max=ds->d_f_max; /* largest frequency of t in document */
196 for (i=0; i < veclen; i++) {
197 freq=ds->terms[i].locc;
200 ds->terms[i].tf=freq/tf_max;
207 static void tf_aug_norm(void *rsi, void *dsi) {
212 /* augmented normalized tf. 0.5 <= tf <= 1 for K = 0.5 */
213 tf_max=ds->d_f_max; /* largest frequency of t in document */
215 K=0.5; /* zvrank.const-K */
216 for (i=0; i < veclen; i++) {
217 freq=ds->terms[i].locc;
220 ds->terms[i].tf=K+(1.0-K)*(freq/tf_max);
227 static void tf_square(void *rsi, void *dsi) {
232 for (i=0; i < veclen; i++) {
233 freq=ds->terms[i].locc;
235 ds->terms[i].tf=freq*freq;
242 static void tf_log(void *rsi, void *dsi) {
247 for (i=0; i < veclen; i++) {
248 freq=ds->terms[i].locc;
250 ds->terms[i].tf=1.0+blog(freq);
257 /* calculate and store inverse document frequency vector */
258 static void idf_none(void *rsi, void *dsi) {
263 for (i=0; i < veclen; i++) {
264 ds->terms[i].idf=1.0;
269 static void idf_tfidf(void *rsi, void *dsi) {
275 /* normal tfidf weight */
277 num_docs=rs->db_docs;
278 for (i=0; i < veclen; i++) {
279 gocc=ds->terms[i].gocc;
283 idf=blog((double) (num_docs/gocc));
284 ds->terms[i].idf=idf;
289 static void idf_prob(void *rsi, void *dsi) {
295 /* probabilistic formulation */
297 num_docs=rs->db_docs;
298 for (i=0; i < veclen; i++) {
299 gocc=ds->terms[i].gocc;
303 idf=blog((double) ((num_docs-gocc)/gocc));
304 ds->terms[i].idf=idf;
309 static void idf_freq(void *rsi, void *dsi) {
315 /* frequency formulation */
317 num_docs=rs->db_docs;
322 for (i=0; i < veclen; i++) {
323 ds->terms[i].idf=idf;
328 static void idf_squared(void *rsi, void *dsi) {
336 num_docs=rs->db_docs;
337 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
338 for (i=0; i < veclen; i++) {
339 gocc=ds->terms[i].gocc;
343 idf=blog(num_docs/gocc);
345 ds->terms[i].idf=idf;
350 /* calculate and store normalized weight (tf-idf) vector */
351 static void norm_none(void *rsi, void *dsi) {
354 /* no normalization */
356 for (i=0; i < veclen; i++) {
357 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
362 static void norm_sum(void *rsi, void *dsi) {
368 for (i=0; i < veclen; i++) {
369 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
370 tfs+=ds->terms[i].wt;
373 for (i=0; i < veclen; i++) {
374 ds->terms[i].wt=ds->terms[i].wt/tfs;
376 /* else: tfs==0 && ds->terms[i].wt==0 */
380 static void norm_cosine(void *rsi, void *dsi) {
386 for (i=0; i < veclen; i++) {
387 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
388 tfs+=(ds->terms[i].wt*ds->terms[i].wt);
392 for (i=0; i < veclen; i++) {
393 ds->terms[i].wt=ds->terms[i].wt/tfs;
395 /* else: tfs==0 && ds->terms[i].wt==0 */
399 static void norm_fourth(void *rsi, void *dsi) {
405 for (i=0; i < veclen; i++) {
406 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
407 fr=(ds->terms[i].wt*ds->terms[i].wt);
412 for (i=0; i < veclen; i++) {
413 ds->terms[i].wt=ds->terms[i].wt/tfs;
415 /* else: tfs==0 && ds->terms[i].wt==0 */
419 static void norm_max(void *rsi, void *dsi) {
425 for (i=0; i < veclen; i++) {
426 ds->terms[i].wt=ds->terms[i].tf*ds->terms[i].idf;
427 if (ds->terms[i].wt > tfm)
431 for (i=0; i < veclen; i++) {
432 ds->terms[i].wt=ds->terms[i].wt/tfm;
434 /* else: tfs==0 && ds->terms[i].wt==0 */
438 /* add: norm_pivot, ... */
440 static double sim_cosine(void *dsi1, void *dsi2) {
444 double smul=0.0, sdiv=0.0, sqr11=0.0, sqr22=0.0;
447 veclen=ds1->veclen; /* and ds2->veclen */
448 for (i=0; i < veclen; i++) {
455 sdiv=sqrt(sqr11*sqr22);
461 /* add: norm_jaccard, norm_dice, ... */
463 /* end weighting functions */
467 static void zv_init_scheme(RS rs, const char *sname) {
469 char c0, c1, c2, c3, c4, c5, c6;
470 const char *def_rscheme="ntc-atn"; /* a good default */
472 yaz_log(LOG_DEBUG, "zv_init_scheme");
475 yaz_log(LOG_LOG, "zvrank: invalid weighting-scheme \"%s\"", sname);
476 if (slen > 0) c0=sname[0]; else c0=def_rscheme[0];
477 if (slen > 1) c1=sname[1]; else c1=def_rscheme[1];
478 if (slen > 2) c2=sname[2]; else c2=def_rscheme[2];
480 if (slen > 4) c4=sname[4]; else c4=def_rscheme[4];
481 if (slen > 5) c5=sname[5]; else c5=def_rscheme[5];
482 if (slen > 6) c6=sname[6]; else c6=def_rscheme[6];
484 /* assign doc functions */
487 rs->d_tf_fct=tf_binary;
491 rs->d_tf_fct=tf_max_norm;
493 yaz_log(LOG_DEBUG, "tf_max_norm: d_f_max required");
496 rs->d_tf_fct=tf_aug_norm;
498 yaz_log(LOG_DEBUG, "tf_aug_norm: d_f_max required");
501 rs->d_tf_fct=tf_square;
509 rs->d_tf_fct=tf_none;
514 rs->d_idf_fct=idf_tfidf;
516 yaz_log(LOG_DEBUG, "idf_tfidf: db_docs required");
519 rs->d_idf_fct=idf_prob;
521 yaz_log(LOG_DEBUG, "idf_prob: db_docs required");
524 rs->d_idf_fct=idf_freq;
526 yaz_log(LOG_DEBUG, "idf_freq: db_docs required");
529 rs->d_idf_fct=idf_squared;
531 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
534 rs->d_idf_fct=idf_none;
539 rs->d_norm_fct=norm_sum;
543 rs->d_norm_fct=norm_cosine;
547 rs->d_norm_fct=norm_fourth;
551 rs->d_norm_fct=norm_max;
555 rs->d_norm_fct=norm_none;
560 /* assign query functions */
563 rs->q_tf_fct=tf_binary;
567 rs->q_tf_fct=tf_max_norm;
568 yaz_log(LOG_DEBUG, "tf_max_norm: d_f_max required");
572 rs->q_tf_fct=tf_aug_norm;
574 yaz_log(LOG_DEBUG, "tf_aug_norm: d_f_max required");
577 rs->q_tf_fct=tf_square;
585 rs->q_tf_fct=tf_none;
590 rs->q_idf_fct=idf_tfidf;
592 yaz_log(LOG_DEBUG, "idf_tfidf: db_docs required");
595 rs->q_idf_fct=idf_prob;
597 yaz_log(LOG_DEBUG, "idf_prob: db_docs required");
600 rs->q_idf_fct=idf_freq;
602 yaz_log(LOG_DEBUG, "idf_freq: db_docs required");
605 rs->q_idf_fct=idf_squared;
607 yaz_log(LOG_DEBUG, "idf_squared: db_docs required");
610 rs->q_idf_fct=idf_none;
615 rs->q_norm_fct=norm_sum;
619 rs->q_norm_fct=norm_cosine;
623 rs->q_norm_fct=norm_fourth;
627 rs->q_norm_fct=norm_max;
631 rs->q_norm_fct=norm_none;
636 rs->sim_fct=sim_cosine;
637 yaz_log(LOG_DEBUG, "zv_scheme %s", rs->rscheme);
641 static void zv_init(RS rs, const char *rscheme) {
642 yaz_log(LOG_DEBUG, "zv_init");
644 rs->db_docs=100000; /* assign correct value here */
645 rs->db_terms=500000; /* assign correct value here (for debugging) */
646 rs->db_f_max=50; /* assign correct value here */
647 rs->db_f_max_str="a"; /* assign correct value here (for debugging) */
648 zv_init_scheme(rs, rscheme);
655 * zv_create: Creates/Initialises this rank handler. This routine is
656 * called exactly once. The routine returns the class_handle.
658 static void *zv_create (ZebraHandle zh) {
662 struct rank_class_info *ci = (struct rank_class_info *)
663 xmalloc (sizeof(*ci));
664 yaz_log(LOG_DEBUG, "zv_create");
665 wscheme=res_get_def(res, "zvrank.weighting-scheme", "");
666 for (i=0; wscheme[i] && i < 8; i++)
667 ci->rscheme[i]=wscheme[i];
668 ci->rscheme[i] = '\0';
673 * zv_destroy: Destroys this rank handler. This routine is called
674 * when the handler is no longer needed - i.e. when the server
675 * dies. The class_handle was previously returned by create.
677 static void zv_destroy (struct zebra_register *reg, void *class_handle) {
678 struct rank_class_info *ci = (struct rank_class_info *) class_handle;
679 yaz_log(LOG_DEBUG, "zv_destroy");
685 * zv_begin: Prepares beginning of "real" ranking. Called once for
686 * each result set. The returned handle is a "set handle" and
687 * will be used in each of the handlers below.
689 static void *zv_begin(struct zebra_register *reg, void *class_handle,
690 RSET rset, NMEM nmem)
692 struct rs_info *rs=(struct rs_info *)xmalloc(sizeof(*rs));
693 struct rank_class_info *ci=(struct rank_class_info *)class_handle;
698 yaz_log(LOG_DEBUG, "zv_begin");
699 veclen= 0 ; /* rset->no_rset_terms;*/ /* smaller vector here */
700 /* FIXME - Now that we don't have term lists in rsets, what do */
702 zv_init(rs, ci->rscheme);
707 rs->qdoc=(struct ds_info *)xmalloc(sizeof(*rs->qdoc));
708 rs->qdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->qdoc->terms)*rs->veclen);
709 rs->qdoc->veclen=veclen;
710 rs->qdoc->d_f_max=1; /* no duplicates */
711 rs->qdoc->d_f_max_str="";
713 rs->rdoc=(struct ds_info *)xmalloc(sizeof(*rs->rdoc));
714 rs->rdoc->terms=(struct ts_info *)xmalloc(sizeof(*rs->rdoc->terms)*rs->veclen);
715 rs->rdoc->veclen=veclen;
716 rs->rdoc->d_f_max=10; /* just a guess */
717 rs->rdoc->d_f_max_str="";
718 /* yaz_log(LOG_DEBUG, "zv_begin_init"); */
719 for (i = 0; i < rs->veclen; i++)
721 gocc= 0; /* rset->rset_terms[i]->nn; */ /* FIXME ??? */
722 /* yaz_log(LOG_DEBUG, "zv_begin_init i=%d gocc=%d", i, gocc); */
723 rs->qdoc->terms[i].gocc=gocc;
724 rs->qdoc->terms[i].locc=1; /* assume query has no duplicate terms */
725 rs->rdoc->terms[i].gocc=gocc;
726 rs->rdoc->terms[i].locc=0;
728 (*rs->q_tf_fct)(rs, rs->qdoc); /* we do this once only */
729 (*rs->q_idf_fct)(rs, rs->qdoc);
730 (*rs->q_norm_fct)(rs, rs->qdoc);
735 * zv_end: Terminates ranking process. Called after a result set
738 static void zv_end (struct zebra_register *reg, void *rsi)
741 yaz_log(LOG_DEBUG, "zv_end");
742 xfree(rs->qdoc->terms);
743 xfree(rs->rdoc->terms);
751 * zv_add: Called for each word occurence in a result set. This routine
752 * should be as fast as possible. This routine should "incrementally"
755 static void zv_add (void *rsi, int seqno, int i) {
757 /* yaz_log(LOG_DEBUG, "zvrank zv_add seqno=%d term_index=%d", seqno, term_index);*/
758 rs->rdoc->terms[i].locc++;
762 * zv_calc: Called for each document in a result. This handler should
763 * produce a score based on previous call(s) to the add handler. The
764 * score should be between 0 and 1000. If score cannot be obtained
765 * -1 should be returned.
767 static int zv_calc (void *rsi, zint sysno)
773 /* yaz_log(LOG_DEBUG, "zv_calc"); */
778 for (i = 0; i < veclen; i++) {
779 /* qdoc weight has already been calculated */
780 (*rs->d_tf_fct)(rs, rs->rdoc);
781 (*rs->d_idf_fct)(rs, rs->rdoc);
782 (*rs->d_norm_fct)(rs, rs->rdoc);
783 dscore=rs->sim_fct(rs->qdoc, rs->rdoc);
785 score = (int) dscore * 1000;
786 yaz_log (LOG_LOG, "sysno=" ZINT_FORMAT " score=%d", sysno, score);
787 if (score > 1000) /* should not happen */
793 * Pseudo-meta code with sequence of calls as they occur in a
794 * server. Handlers are prefixed by --:
810 static struct rank_control rank_control_vsm = {
820 struct rank_control *rankzv_class = &rank_control_vsm;
822 #endif /* SKIPTHIS */