-/* $Id: ranksimilarity.c,v 1.3 2006-05-04 10:11:09 marc Exp $
- Copyright (C) 1995-2005
- Index Data ApS
-
-This file is part of the Zebra server.
+/* This file is part of the Zebra server.
+ Copyright (C) 1994-2010 Index Data
Zebra is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
for more details.
You should have received a copy of the GNU General Public License
-along with Zebra; see the file LICENSE.zebra. If not, write to the
-Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
-02111-1307, USA.
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
*/
#include <stdio.h>
/** number of docs within result set */
zint no_docs_resset;
- /** number of terms in this field */
- zint no_terms_field;
+ /** number of docs with this fieldindex in database */
+ zint no_docs_fieldindex;
- /** number of docs with this field in database*/
- zint no_docs_field;
+ /** number of terms in this fieldindex */
+ zint no_terms_fieldindex;
/** rank flag is one if term is to be included in ranking */
int rank_flag;
- /** relative ranking weight of term */
- int term_weight;
+ /** relative ranking weight of term fieldindex */
+ int fieldindex_weight;
/** term id used to access term name and other info */
TERMID term;
/** number of terms in query which are included in ranking */
int no_ranked_terms_query;
- /** number of documents in entire collection */
+ /** number of documents in entire database */
zint no_docs_database;
+ /** number of terms in entire database */
+ zint no_terms_database;
+
/** array of size no_terms_query with statistics gathered per term */
struct ranksimilarity_term_info *entries;
/* setting database global statistics */
si->no_docs_database = -1; /* TODO */
+ si->no_terms_database = -1; /* TODO */
/* setting query statistics */
si->no_terms_query = numterms;
ranksimilar_rec_reset(si);
- /* looping all terms in a specific field of query */
+ /* looping all terms in a specific fieldindex of query */
for (i = 0; i < numterms; i++)
{
struct ord_list *ol = NULL;
{
const char *cp = strstr(terms[i]->flags+4, ",w=");
+ zint no_docs_fieldindex = 0;
+ zint no_terms_fieldindex = 0;
+
+ yaz_log(log_level, "begin() terms[%d]: '%s' flags=%s",
+ i, terms[i]->name, terms[i]->flags);
+
(si->no_ranked_terms_query)++;
ol = terms[i]->ol;
- si->entries[i].rank_flag = 1;
- /* notice that the call to rset_count(rset) has he side-effect of setting
- rset->hits_limit = rset_count(rset) ??? */
+ si->entries[i].rank_flag = 1;
+ /* notice that the call to rset_count(rset) has the side-effect
+ of setting rset->hits_limit = rset_count(rset) ??? */
si->entries[i].freq_term_resset = rset_count(terms[i]->rset);
- /* si->entries[i].freq_term_resset = terms[i]->rset->hits_count; */
+ si->entries[i].no_docs_resset = terms[i]->rset->hits_count;
-
- yaz_log(log_level, "begin() rset_count(terms[%d]->rset) = %d",
- i, rset_count(terms[i]->rset));
- yaz_log(log_level, "begin() terms[%d]->rset->hits_limit = %d",
- i, terms[i]->rset->hits_limit);
- yaz_log(log_level, "begin() terms[%d]->rset->hits_count = %d",
- i, terms[i]->rset->hits_count);
- yaz_log(log_level, "begin() terms[%d]->rset->hits_round = %d",
- i, terms[i]->rset->hits_round);
- yaz_log(log_level, "begin() terms[%d]->rset->hits_approx = %d",
- i, terms[i]->rset->hits_approx);
-
- si->entries[i].no_docs_resset = -1; /*TODO*/
- si->entries[i].no_docs_field = -1; /*TODO*/
- si->entries[i].no_terms_field = -1; /*TODO*/
-
- if (cp)
- si->entries[i].term_weight = atoi (cp+3);
+ if (cp)
+ si->entries[i].fieldindex_weight = atoi (cp+3);
else
- si->entries[i].term_weight = 34; /* sqrroot of 1000 */
-
- yaz_log(log_level, "begin() terms[%d]: '%s' flags=%s",
- i, terms[i]->name, terms[i]->flags);
+ si->entries[i].fieldindex_weight = 34; /* sqrroot of 1000 */
+
+
+ /*
+ yaz_log(log_level, "begin() rset_count(terms[%d]->rset) = "
+ ZINT_FORMAT, i, rset_count(terms[i]->rset));
+ yaz_log(log_level, "begin() terms[%d]->rset->hits_limit = "
+ ZINT_FORMAT, i, terms[i]->rset->hits_limit);
+ yaz_log(log_level, "begin() terms[%d]->rset->hits_count = "
+ ZINT_FORMAT, i, terms[i]->rset->hits_count);
+ yaz_log(log_level, "begin() terms[%d]->rset->hits_round = "
+ ZINT_FORMAT, i, terms[i]->rset->hits_round);
+ yaz_log(log_level, "begin() terms[%d]->rset->hits_approx = %d",
+ i, terms[i]->rset->hits_approx);
+ */
/* looping indexes where term terms[i] is found */
- for (; ol; ol = ol->next)
+
+ for (; ol; ol = ol->next)
{
- int index_type = 0;
+ const char *index_type = 0;
const char *db = 0;
const char *string_index = 0;
- int set = -1;
- int use = -1;
-
+
zebraExplain_lookup_ord(reg->zei,
- ol->ord, &index_type, &db, &set, &use,
+ ol->ord, &index_type, &db,
&string_index);
+ no_docs_fieldindex
+ += zebraExplain_ord_get_doc_occurrences(reg->zei, ol->ord);
+ no_terms_fieldindex
+ += zebraExplain_ord_get_term_occurrences(reg->zei, ol->ord);
+
if (string_index)
- yaz_log(log_level,
- "begin() index: ord=%d type=%c db=%s str-index=%s",
- ol->ord, index_type, db, string_index);
+ yaz_log(log_level,
+ "begin() index: ord=%d type=%s db=%s str-index=%s",
+ ol->ord, index_type, db, string_index);
else
- yaz_log(log_level,
- "begin() index: ord=%d type=%c db=%s set=%d use=%d",
- ol->ord, index_type, db, set, use);
+ yaz_log(log_level,
+ "begin() index: ord=%d type=%s db=%s",
+ ol->ord, index_type, db);
}
+ si->entries[i].no_docs_fieldindex = no_docs_fieldindex;
+ si->entries[i].no_terms_fieldindex = no_terms_fieldindex;
}
si->entries[i].term = terms[i];
}
+
/**
* add: Called for each word occurence in a result set. This routine
* should be as fast as possible. This routine should "incrementally"
assert(ti);
si->last_pos = seqno;
ti->freq_term_docfield++;
- /* yaz_log(log_level, "add() seqno=%d term=%s freq_term_docfield=%d",
- seqno, term->name, ti->freq_term_docfield); */
+ /*yaz_log(log_level, "add() seqno=%d term=%s freq_term_docfield=%d",
+ seqno, term->name, ti->freq_term_docfield); */
}
/*
= (struct ranksimilarity_set_info *) set_handle;
- yaz_log(log_level, "calc() sysno = %d", sysno);
- yaz_log(log_level, "calc() staticrank = %d", staticrank);
+ yaz_log(log_level, "calc() sysno = " ZINT_FORMAT, sysno);
+ yaz_log(log_level, "calc() staticrank = " ZINT_FORMAT, staticrank);
yaz_log(log_level, "calc() si->no_terms_query = %d",
si->no_terms_query);
yaz_log(log_level, "calc() si->no_ranked_terms_query = %d",
si->no_ranked_terms_query);
- yaz_log(log_level, "calc() si->no_docs_database = %d",
+ yaz_log(log_level, "calc() si->no_docs_database = " ZINT_FORMAT,
si->no_docs_database);
+ yaz_log(log_level, "calc() si->no_terms_database = " ZINT_FORMAT,
+ si->no_terms_database);
if (!si->no_ranked_terms_query)
/* you may use all the gathered statistics here */
for (i = 0; i < si->no_terms_query; i++)
{
- yaz_log(log_level, "calc() entries[%d] termid %d",
+ yaz_log(log_level, "calc() entries[%d] termid %p",
i, si->entries[i].term);
if (si->entries[i].term){
yaz_log(log_level, "calc() entries[%d] term '%s' flags=%s",
i, si->entries[i].term->name, si->entries[i].term->flags);
yaz_log(log_level, "calc() entries[%d] rank_flag %d",
i, si->entries[i].rank_flag );
- yaz_log(log_level, "calc() entries[%d] term_weight %d",
- i, si->entries[i].term_weight );
+ yaz_log(log_level, "calc() entries[%d] fieldindex_weight %d",
+ i, si->entries[i].fieldindex_weight );
yaz_log(log_level, "calc() entries[%d] freq_term_docfield %d",
i, si->entries[i].freq_term_docfield );
- yaz_log(log_level, "calc() entries[%d] freq_term_resset %d",
+ yaz_log(log_level, "calc() entries[%d] freq_term_resset " ZINT_FORMAT,
i, si->entries[i].freq_term_resset );
- yaz_log(log_level, "calc() entries[%d] no_docs_resset %d",
+ yaz_log(log_level, "calc() entries[%d] no_docs_resset " ZINT_FORMAT,
i, si->entries[i].no_docs_resset );
- yaz_log(log_level, "calc() entries[%d] no_docs_field %d",
- i, si->entries[i].no_docs_field );
- yaz_log(log_level, "calc() entries[%d] no_terms_field %d",
- i, si->entries[i].no_terms_field );
+ yaz_log(log_level, "calc() entries[%d] no_docs_fieldindex "
+ ZINT_FORMAT,
+ i, si->entries[i].no_docs_fieldindex );
+ yaz_log(log_level, "calc() entries[%d] no_terms_fieldindex "
+ ZINT_FORMAT,
+ i, si->entries[i].no_terms_fieldindex );
}
}
/* staticrank = 0 is highest, MAXINT lowest */
- score = INT_MAX - staticrank; /* but score is reverse (logical) */
+ if (staticrank >= INT_MAX)
+ score = 0;
+ else
+ { /* but score is reverse (logical) */
+ score = INT_MAX - CAST_ZINT_TO_INT(staticrank);
+ }
/* debugging statistics output */
};
struct rank_control *rank_similarity_class = &rank_control;
+/*
+ * Local variables:
+ * c-basic-offset: 4
+ * c-file-style: "Stroustrup"
+ * indent-tabs-mode: nil
+ * End:
+ * vim: shiftwidth=4 tabstop=8 expandtab
+ */
+