From 556f48eff358140d9b68549bf88988133fba786d Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Tue, 3 Nov 2009 15:03:41 +0100 Subject: [PATCH] Modify length calculation for ranking Use a different denominator (length) for per-field relevance scoring. Instead of the length of all ranked fields, we now use the length of individual fields (as if they were individual "free" text fields). This will ensure that documents with a long field with no match (say description) will not "hurt" a title match. --- src/logic.c | 3 ++- src/record.h | 2 ++ src/relevance.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++--------- src/relevance.h | 2 +- 4 files changed, 57 insertions(+), 12 deletions(-) diff --git a/src/logic.c b/src/logic.c index d5c8f76..3afa67c 100644 --- a/src/logic.c +++ b/src/logic.c @@ -1252,7 +1252,8 @@ struct record *ingest_record(struct client *cl, const char *rec, // ranking of _all_ fields enabled ... if (ser_md->rank) relevance_countwords(se->relevance, cluster, - (char *) value, ser_md->rank); + (char *) value, ser_md->rank, + ser_md->name); // construct facets ... 
if (ser_md->termlist) diff --git a/src/record.h b/src/record.h index 872c36e..ac38ad3 100644 --- a/src/record.h +++ b/src/record.h @@ -114,6 +114,8 @@ struct record_cluster char *merge_key; int relevance; int *term_frequency_vec; + int *term_frequency_vec_tmp; + float *term_frequency_vecf; // Set-specific ID for this record char *recid; struct record *records; diff --git a/src/relevance.c b/src/relevance.c index 7603a7a..072a894 100644 --- a/src/relevance.c +++ b/src/relevance.c @@ -21,6 +21,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include #endif +#include #include #include @@ -90,19 +91,34 @@ static struct word_entry *build_word_entries(pp2_charset_t pct, NMEM nmem, } void relevance_countwords(struct relevance *r, struct record_cluster *cluster, - const char *words, int multiplier) + const char *words, int multiplier, const char *name) { pp2_relevance_token_t prt = pp2_relevance_tokenize(r->pct, words); - + int *mult = cluster->term_frequency_vec_tmp; const char *norm_str; - + int i, length = 0; + + for (i = 1; i < r->vec_len; i++) + mult[i] = 0; + while ((norm_str = pp2_relevance_token_next(prt))) { int res = word_entry_match(r->entries, norm_str); if (res) - cluster->term_frequency_vec[res] += multiplier; - cluster->term_frequency_vec[0]++; + { + assert(res < r->vec_len); + mult[res] += multiplier; + } + length++; } + + for (i = 1; i < r->vec_len; i++) + { + cluster->term_frequency_vecf[i] += (double) mult[i] / length; + cluster->term_frequency_vec[i] += mult[i]; + } + + cluster->term_frequency_vec[0] += length; pp2_relevance_token_destroy(prt); } @@ -128,8 +144,26 @@ void relevance_newrec(struct relevance *r, struct record_cluster *rec) { if (!rec->term_frequency_vec) { - rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int)); - memset(rec->term_frequency_vec, 0, r->vec_len * sizeof(int)); + int i; + + // term frequency [1,..] . 
[0] is total length of all fields + rec->term_frequency_vec = + nmem_malloc(r->nmem, + r->vec_len * sizeof(*rec->term_frequency_vec)); + for (i = 0; i < r->vec_len; i++) + rec->term_frequency_vec[i] = 0; + + // term frequency divided by length of field [1,...] + rec->term_frequency_vecf = + nmem_malloc(r->nmem, + r->vec_len * sizeof(*rec->term_frequency_vecf)); + for (i = 0; i < r->vec_len; i++) + rec->term_frequency_vecf[i] = 0.0; + + // for relevance_countwords (so we don't have to xmalloc/xfree) + rec->term_frequency_vec_tmp = + nmem_malloc(r->nmem, + r->vec_len * sizeof(*rec->term_frequency_vec_tmp)); } } @@ -182,9 +216,17 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist) for (t = 1; t < rel->vec_len; t++) { float termfreq; - if (!rec->term_frequency_vec[0]) - break; - termfreq = (float) rec->term_frequency_vec[t] / rec->term_frequency_vec[0]; +#if 1 + termfreq = (float) rec->term_frequency_vecf[t]; +#else + if (rec->term_frequency_vec[0]) + { + termfreq = (float) + rec->term_frequency_vec[t] / rec->term_frequency_vec[0] ; + } + else + termfreq = 0.0; +#endif relevance += 100000 * (termfreq * idfvec[t] + 0.0000005); } rec->relevance = relevance; diff --git a/src/relevance.h b/src/relevance.h index e271d8c..28f4daa 100644 --- a/src/relevance.h +++ b/src/relevance.h @@ -31,7 +31,7 @@ struct relevance *relevance_create(pp2_charset_t pct, NMEM nmem, const char **terms); void relevance_newrec(struct relevance *r, struct record_cluster *cluster); void relevance_countwords(struct relevance *r, struct record_cluster *cluster, - const char *words, int multiplier); + const char *words, int multiplier, const char *name); void relevance_donerecord(struct relevance *r, struct record_cluster *cluster); void relevance_prepare_read(struct relevance *rel, struct reclist *rec); -- 1.7.10.4