1 /* This file is part of the Zebra server.
2 Copyright (C) 1994-2010 Index Data
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
/* yaz log level passed to the yaz_log() calls of this handler; create()
   assigns it from yaz_log_module_level("rank-similarity") */
33 static int log_level = 0;
/* tested by create(): the log level lookup is done only while this is 0 */
34 static int log_initialized = 0;
/* class-level state of the rank handler: create() allocates one instance
   with xmalloc() and destroy() receives it back as class_handle */
36 struct ranksimilarity_class_info {
40 /** term specific info and statistics to be used under ranking;
    one entry per query term, filled in by begin() and add() and
    logged by calc() */
41 struct ranksimilarity_term_info {
43 /** frequency of term within document field; incremented by add()
    and set back to 0 by ranksimilar_rec_reset() */
44 int freq_term_docfield;
46 /** frequency of term within result set of given term; begin()
    stores rset_count(terms[i]->rset) here */
47 zint freq_term_resset;
49 /** number of docs within result set; begin() copies
    terms[i]->rset->hits_count into no_docs_resset */
52 /** number of docs with this fieldindex in database; assigned in
    begin() from its local no_docs_fieldindex total */
53 zint no_docs_fieldindex;
55 /** number of terms in this fieldindex; assigned in begin() from its
    local no_terms_fieldindex total */
56 zint no_terms_fieldindex;
58 /** rank flag is one if term is to be included in ranking; begin()
    sets it to 0 when the term flags do not start with "rank," */
61 /** relative ranking weight of term fieldindex; begin() parses it
    with atoi() from the text following ",w=" in the term flags and
    otherwise appears to fall back to 34 */
62 int fieldindex_weight;
64 /** term id used to access term name and other info; begin() stores
    terms[i] here */
67 /** index number in terms[i] array, i.e. the i that begin() used for
    this entry */
/* per-result-set ranking state: allocated with nmem_malloc() in begin();
   add() and calc() receive it back as set_handle */
71 struct ranksimilarity_set_info {
74 /** number of terms in query; begin() sets no_terms_query to numterms */
77 /** number of terms in query which are included in ranking; counted
    in begin(), and calc() returns -1 while it is 0 */
78 int no_ranked_terms_query;
80 /** number of documents in entire database; begin() currently stores
    -1 (marked TODO there) */
81 zint no_docs_database;
83 /** number of terms in entire database; begin() currently stores -1
    (marked TODO there) */
84 zint no_terms_database;
86 /** array of size no_terms_query with statistics gathered per term;
    allocated with nmem_malloc() in begin() */
87 struct ranksimilarity_term_info *entries;
93 /* local clean-up function: sets freq_term_docfield back to 0 in each of
   the si->no_terms_query entries; called from begin() and from calc() */
94 static void ranksimilar_rec_reset(struct ranksimilarity_set_info *si)
98 for (i = 0; i < si->no_terms_query; i++)
100 si->entries[i].freq_term_docfield = 0;
106 * create: Creates/Initialises this rank handler. This routine is
107 * called exactly once. The routine returns the class_handle.
/* Allocates the class info with xmalloc(), looks up the log level of
   module "rank-similarity" if logging is not initialised yet, and logs
   the call. */
109 static void *create (ZebraHandle zh)
111 struct ranksimilarity_class_info *ci =
112 (struct ranksimilarity_class_info *) xmalloc (sizeof(*ci));
/* resolve the module log level only while log_initialized is still 0 */
114 if (!log_initialized)
116 log_level = yaz_log_module_level("rank-similarity");
119 yaz_log(log_level, "create()");
124 * destroy: Destroys this rank handler. This routine is called
125 * when the handler is no longer needed - i.e. when the server
126 * dies. The class_handle was previously returned by create.
/* Casts class_handle back to the class info type allocated by create()
   and logs the call. */
128 static void destroy (struct zebra_register *reg, void *class_handle)
130 struct ranksimilarity_class_info *ci
131 = (struct ranksimilarity_class_info *) class_handle;
132 yaz_log(log_level, "destroy()");
138 * begin: Prepares beginning of "real" ranking. Called once for
139 * each result set. The returned handle is a "set handle" and
140 * will be used in each of the handlers below.
/* Builds the ranking state for one result set.
 *
 * reg      - register; reg->zei is used for the explain lookups per ord
 * nmem     - memory pool the set info is allocated from
 * terms    - the query terms; each gets terms[i]->rankpriv pointed at its
 *            statistics entry
 * numterms - number of elements in terms
 *
 * For every term the rank flag, result set counts, weight and
 * per-fieldindex totals are stored in si->entries[i].
 */
142 static void *begin (struct zebra_register *reg,
143 void *class_handle, RSET rset, NMEM nmem,
144 TERMID *terms, int numterms)
146 struct ranksimilarity_set_info *si =
147 (struct ranksimilarity_set_info *) nmem_malloc (nmem, sizeof(*si));
150 yaz_log(log_level, "begin() numterms=%d", numterms);
152 /* setting database global statistics */
153 si->no_docs_database = -1; /* TODO: -1 is a placeholder value */
154 si->no_terms_database = -1; /* TODO: -1 is a placeholder value */
156 /* setting query statistics */
157 si->no_terms_query = numterms;
158 si->no_ranked_terms_query = 0;
160 /* setting internal data structures: one statistics entry per term */
162 si->entries = (struct ranksimilarity_term_info *)
163 nmem_malloc (si->nmem, sizeof(*si->entries)*numterms);
165 /* zero freq_term_docfield in every entry before counting starts */
166 ranksimilar_rec_reset(si);
169 /* looping all terms in a specific fieldindex of query */
170 for (i = 0; i < numterms; i++)
172 struct ord_list *ol = NULL;
175 /* strncmp() is non-zero when the flags do not start with "rank,":
    such a term is excluded from ranking */
176 if (strncmp (terms[i]->flags, "rank,", 5))
178 si->entries[i].rank_flag = 0;
179 yaz_log(log_level, "begin() terms[%d]: '%s' flags=%s not ranked",
180 i, terms[i]->name, terms[i]->flags);
/* search for an optional ",w=" weight marker, starting at offset 4 of
   the flags */
184 const char *cp = strstr(terms[i]->flags+4, ",w=");
/* running totals over all ords of this term */
186 zint no_docs_fieldindex = 0;
187 zint no_terms_fieldindex = 0;
189 yaz_log(log_level, "begin() terms[%d]: '%s' flags=%s",
190 i, terms[i]->name, terms[i]->flags);
/* one more term takes part in ranking */
192 (si->no_ranked_terms_query)++;
195 si->entries[i].rank_flag = 1;
196 /* notice that the call to rset_count(rset) has the side-effect
197 of setting rset->hits_limit = rset_count(rset) ??? */
198 si->entries[i].freq_term_resset = rset_count(terms[i]->rset);
199 si->entries[i].no_docs_resset = terms[i]->rset->hits_count;
/* weight is the integer that follows the three characters of ",w=" */
203 si->entries[i].fieldindex_weight = atoi (cp+3);
205 si->entries[i].fieldindex_weight = 34; /* default weight, described as square root of 1000; NOTE(review): sqrt(1000) is about 31.6, confirm the intended value */
/* NOTE: rset_count() is evaluated a second time here, only for logging */
209 yaz_log(log_level, "begin() rset_count(terms[%d]->rset) = "
210 ZINT_FORMAT, i, rset_count(terms[i]->rset));
211 yaz_log(log_level, "begin() terms[%d]->rset->hits_limit = "
212 ZINT_FORMAT, i, terms[i]->rset->hits_limit);
213 yaz_log(log_level, "begin() terms[%d]->rset->hits_count = "
214 ZINT_FORMAT, i, terms[i]->rset->hits_count);
215 yaz_log(log_level, "begin() terms[%d]->rset->hits_round = "
216 ZINT_FORMAT, i, terms[i]->rset->hits_round);
217 yaz_log(log_level, "begin() terms[%d]->rset->hits_approx = %d",
218 i, terms[i]->rset->hits_approx);
221 /* looping indexes where term terms[i] is found; each ord contributes
    its document and term occurrence counts from the explain database */
223 for (; ol; ol = ol->next)
225 const char *index_type = 0;
227 const char *string_index = 0;
229 zebraExplain_lookup_ord(reg->zei,
230 ol->ord, &index_type, &db,
234 += zebraExplain_ord_get_doc_occurrences(reg->zei, ol->ord);
236 += zebraExplain_ord_get_term_occurrences(reg->zei, ol->ord);
240 "begin() index: ord=%d type=%s db=%s str-index=%s",
241 ol->ord, index_type, db, string_index);
244 "begin() index: ord=%d type=%s db=%s",
245 ol->ord, index_type, db);
/* store the totals gathered for this term */
248 si->entries[i].no_docs_fieldindex = no_docs_fieldindex;
249 si->entries[i].no_terms_fieldindex = no_terms_fieldindex;
252 si->entries[i].term = terms[i];
253 si->entries[i].term_index=i;
255 /* let the term point at its statistics entry; add() reads it back
    through term->rankpriv */
256 terms[i]->rankpriv = &(si->entries[i]);
263 * end: Terminates ranking process. Called after a result set
/* Logs "end()" at the module log level. */
266 static void end (struct zebra_register *reg, void *set_handle)
268 yaz_log(log_level, "end()");
274 * add: Called for each word occurrence in a result set. This routine
275 * should be as fast as possible. This routine should "incrementally"
/* Records one occurrence of term: stores seqno in si->last_pos and
   increments freq_term_docfield of the statistics entry that
   term->rankpriv points to. */
278 static void add (void *set_handle, int seqno, TERMID term)
280 struct ranksimilarity_set_info *si
281 = (struct ranksimilarity_set_info *) set_handle;
282 struct ranksimilarity_term_info *ti;
286 /* yaz_log(log_level, "add() seqno=%d NULL term", seqno); */
/* rankpriv was set to &(si->entries[i]) by begin() */
290 ti= (struct ranksimilarity_term_info *) term->rankpriv;
292 si->last_pos = seqno;
293 ti->freq_term_docfield++;
294 /*yaz_log(log_level, "add() seqno=%d term=%s freq_term_docfield=%d",
295 seqno, term->name, ti->freq_term_docfield); */
299 * calc: Called for each document in a result. This handler should
300 * produce a score based on previous call(s) to the add handler. The
301 * score should be between 0 and 1000. If score cannot be obtained
302 * -1 should be returned.
/* Logs the statistics gathered for the current document, zeroes the
   per-document term frequencies again and derives score from staticrank.
   Returns -1 when no query term takes part in ranking.
   NOTE(review): the handler description asks for a score between 0 and
   1000, but INT_MAX - staticrank is not confined to that range; confirm
   what callers expect. */
304 static int calc (void *set_handle, zint sysno, zint staticrank,
308 struct ranksimilarity_set_info *si
309 = (struct ranksimilarity_set_info *) set_handle;
312 yaz_log(log_level, "calc() sysno = " ZINT_FORMAT, sysno);
313 yaz_log(log_level, "calc() staticrank = " ZINT_FORMAT, staticrank);
315 yaz_log(log_level, "calc() si->no_terms_query = %d",
317 yaz_log(log_level, "calc() si->no_ranked_terms_query = %d",
318 si->no_ranked_terms_query);
319 yaz_log(log_level, "calc() si->no_docs_database = " ZINT_FORMAT,
320 si->no_docs_database);
321 yaz_log(log_level, "calc() si->no_terms_database = " ZINT_FORMAT,
322 si->no_terms_database);
325 if (!si->no_ranked_terms_query)
326 return -1; /* ranking not enabled for any terms */
329 /* if we set *stop_flag = 1, we stop processing (of result set list) */
332 /* here goes your formula to compute a scoring function */
333 /* you may use all the gathered statistics here */
/* at present the loop only logs the statistics of every entry */
334 for (i = 0; i < si->no_terms_query; i++)
336 yaz_log(log_level, "calc() entries[%d] termid %p",
337 i, si->entries[i].term);
338 if (si->entries[i].term){
339 yaz_log(log_level, "calc() entries[%d] term '%s' flags=%s",
340 i, si->entries[i].term->name, si->entries[i].term->flags);
341 yaz_log(log_level, "calc() entries[%d] rank_flag %d",
342 i, si->entries[i].rank_flag );
343 yaz_log(log_level, "calc() entries[%d] fieldindex_weight %d",
344 i, si->entries[i].fieldindex_weight );
345 yaz_log(log_level, "calc() entries[%d] freq_term_docfield %d",
346 i, si->entries[i].freq_term_docfield );
347 yaz_log(log_level, "calc() entries[%d] freq_term_resset " ZINT_FORMAT,
348 i, si->entries[i].freq_term_resset );
349 yaz_log(log_level, "calc() entries[%d] no_docs_resset " ZINT_FORMAT,
350 i, si->entries[i].no_docs_resset );
351 yaz_log(log_level, "calc() entries[%d] no_docs_fieldindex "
353 i, si->entries[i].no_docs_fieldindex );
354 yaz_log(log_level, "calc() entries[%d] no_terms_fieldindex "
356 i, si->entries[i].no_terms_fieldindex );
361 /* zero freq_term_docfield in every entry so that add() counts from
    scratch again (presumably for the next document) */
362 ranksimilar_rec_reset(si);
365 /* staticrank = 0 is highest, MAXINT lowest */
366 if (staticrank >= INT_MAX)
369 { /* but score is reverse (logical): a smaller staticrank gives a larger score */
370 score = INT_MAX - CAST_ZINT_TO_INT(staticrank);
374 /* debugging statistics output */
375 yaz_log(log_level, "calc() statistics: score = %d", score);
381 * Pseudo-meta code with sequence of calls as they occur in a
382 * server. Handlers are prefixed by --:
/* file-local control structure of this rank class (presumably listing the
   handlers defined above; TODO confirm against the initializer) */
398 static struct rank_control rank_control = {
/* non-static pointer to the control structure above, so it can be
   referenced from outside this file */
408 struct rank_control *rank_similarity_class = &rank_control;
412 * c-file-style: "Stroustrup"
413 * indent-tabs-mode: nil
415 * vim: shiftwidth=4 tabstop=8 expandtab