From c33ea56e3771c3b80ba66ef8fda3a09cad171ebb Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Fri, 8 Sep 2006 14:40:51 +0000 Subject: [PATCH] Honor position attribute, i.e. allow first-in-field search. To enable this, "firstinfield 1" must be given for an index in default.idx. Enabled in tab/default.idx for w. At this stage first-in-field is only supported for phrase searches (including simple words). --- NEWS | 6 +++ include/zebramap.h | 5 ++- index/extract.c | 18 ++++++-- index/index.h | 5 ++- index/zrpn.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++- rset/rsmultiandor.c | 3 +- tab/default.idx | 3 +- test/api/t9.c | 9 +--- test/marcxml/t1.c | 8 +++- util/zebramap.c | 17 +++++++- 10 files changed, 171 insertions(+), 18 deletions(-) diff --git a/NEWS b/NEWS index 184eae0..e141a43 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,9 @@ +Honor position attribute, i.e. allow first-in-field search. To +enable this, "firstinfield 1" must be given for an index in +default.idx. Enabled in tab/default.idx for w. At this stage +first-in-field is only supported for phrase searches (including +simple words). + Common stream reader interface for record filters (struct ZebraRecStream). 
Debian package fix: packages idzebra-2.0 + libidzebra-2.0-modules did diff --git a/include/zebramap.h b/include/zebramap.h index 2061bc2..0b837cd 100644 --- a/include/zebramap.h +++ b/include/zebramap.h @@ -1,4 +1,4 @@ -/* $Id: zebramap.h,v 1.19 2006-08-15 14:28:32 adam Exp $ +/* $Id: zebramap.h,v 1.20 2006-09-08 14:40:51 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -72,6 +72,9 @@ YAZ_EXPORT int zebra_maps_is_positioned (ZebraMaps zms, unsigned reg_id); YAZ_EXPORT +int zebra_maps_is_first_in_field (ZebraMaps zms, unsigned reg_id); + +YAZ_EXPORT WRBUF zebra_replace(ZebraMaps zms, unsigned reg_id, const char *ex_list, const char *input_str, int input_len); diff --git a/index/extract.c b/index/extract.c index 3874dfe..8189504 100644 --- a/index/extract.c +++ b/index/extract.c @@ -1,4 +1,4 @@ -/* $Id: extract.c,v 1.228 2006-08-22 13:39:27 adam Exp $ +/* $Id: extract.c,v 1.229 2006-09-08 14:40:52 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -44,7 +44,7 @@ struct encode_info { }; static int log_level = 0; -static int log_level_initialized = 1; +static int log_level_initialized = 0; static void zebra_init_log_level() { @@ -1329,7 +1329,7 @@ static void extract_add_string(RecWord *p, const char *string, int length) } } -static void extract_add_incomplete_field (RecWord *p) +static void extract_add_incomplete_field(RecWord *p) { ZebraHandle zh = p->extractCtrl->handle; const char *b = p->term_buf; @@ -1339,6 +1339,15 @@ static void extract_add_incomplete_field (RecWord *p) if (remain > 0) map = zebra_maps_input(zh->reg->zebra_maps, p->index_type, &b, remain, 0); + if (map) + { + if (zebra_maps_is_first_in_field(zh->reg->zebra_maps, p->index_type)) + { + /* first in field marker */ + extract_add_string(p, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN); + p->seqno++; + } + } while (map) { char buf[IT_MAX_WORD+1]; @@ -1440,11 +1449,14 @@ static void extract_token_add(RecWord *p) { ZebraHandle zh = p->extractCtrl->handle; WRBUF wrbuf; + if (log_level) + { 
yaz_log(log_level, "extract_token_add " "type=%c index=%s seqno=" ZINT_FORMAT " s=%.*s", p->index_type, p->index_name, p->seqno, p->term_len, p->term_buf); + } if ((wrbuf = zebra_replace(zh->reg->zebra_maps, p->index_type, 0, p->term_buf, p->term_len))) { diff --git a/index/index.h b/index/index.h index 5eb29c0..a147563 100644 --- a/index/index.h +++ b/index/index.h @@ -1,4 +1,4 @@ -/* $Id: index.h,v 1.174 2006-08-22 13:39:27 adam Exp $ +/* $Id: index.h,v 1.175 2006-09-08 14:40:52 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -453,6 +453,9 @@ ZEBRA_RES zebra_update_file_match(ZebraHandle zh, const char *path); ZEBRA_RES zebra_update_from_path(ZebraHandle zh, const char *path); ZEBRA_RES zebra_delete_from_path(ZebraHandle zh, const char *path); +#define FIRST_IN_FIELD_STR "\001^" +#define FIRST_IN_FIELD_LEN 2 + YAZ_END_CDECL #endif diff --git a/index/zrpn.c b/index/zrpn.c index 9483fe0..4ae0937 100644 --- a/index/zrpn.c +++ b/index/zrpn.c @@ -1,4 +1,4 @@ -/* $Id: zrpn.c,v 1.227 2006-08-31 08:35:48 adam Exp $ +/* $Id: zrpn.c,v 1.228 2006-09-08 14:40:53 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -1437,6 +1437,93 @@ static ZEBRA_RES term_list_trunc(ZebraHandle zh, return ZEBRA_OK; } +static ZEBRA_RES rpn_search_APT_position(ZebraHandle zh, + Z_AttributesPlusTerm *zapt, + oid_value attributeSet, + int reg_type, + int num_bases, char **basenames, + NMEM rset_nmem, + RSET *rset, + struct rset_key_control *kc) +{ + RSET *f_set; + int base_no; + int position_value; + int num_sets = 0; + AttrType position; + + attr_init_APT(&position, zapt, 3); + position_value = attr_find(&position, NULL); + switch(position_value) + { + case 3: + case -1: + return ZEBRA_OK; + case 1: + case 2: + break; + default: + zebra_setError_zint(zh, YAZ_BIB1_UNSUPP_POSITION_ATTRIBUTE, + position_value); + return ZEBRA_FAIL; + } + + if (!zebra_maps_is_first_in_field(zh->reg->zebra_maps, reg_type)) + { + zebra_setError_zint(zh, YAZ_BIB1_UNSUPP_POSITION_ATTRIBUTE, + position_value); + 
return ZEBRA_FAIL; + } + + if (!zh->reg->isamb) + { + zebra_setError_zint(zh, YAZ_BIB1_UNSUPP_POSITION_ATTRIBUTE, + position_value); + return ZEBRA_FAIL; + } + f_set = xmalloc(sizeof(RSET) * num_bases); + for (base_no = 0; base_no < num_bases; base_no++) + { + int ord = -1; + char ord_buf[32]; + char term_dict[100]; + int ord_len; + char *val; + ISAM_P isam_p; + + if (zebraExplain_curDatabase (zh->reg->zei, basenames[base_no])) + { + zebra_setError(zh, YAZ_BIB1_DATABASE_UNAVAILABLE, + basenames[base_no]); + return ZEBRA_FAIL; + } + + if (zebra_apt_get_ord(zh, zapt, reg_type, 0, + attributeSet, &ord) != ZEBRA_OK) + continue; + + ord_len = key_SU_encode (ord, ord_buf); + memcpy(term_dict, ord_buf, ord_len); + strcpy(term_dict+ord_len, FIRST_IN_FIELD_STR); + val = dict_lookup(zh->reg->dict, term_dict); + if (!val) + continue; + assert(*val == sizeof(ISAM_P)); + memcpy(&isam_p, val+1, sizeof(isam_p)); + + f_set[num_sets++] = rsisamb_create(rset_nmem, kc, kc->scope, + zh->reg->isamb, isam_p, 0); + + } + if (num_sets) + { + *rset = rset_create_or(rset_nmem, kc, kc->scope, + 0 /* termid */, num_sets, f_set); + } + xfree(f_set); + return ZEBRA_OK; +} + static ZEBRA_RES rpn_search_APT_phrase(ZebraHandle zh, Z_AttributesPlusTerm *zapt, const char *termz_org, @@ -1459,8 +1546,30 @@ static ZEBRA_RES rpn_search_APT_phrase(ZebraHandle zh, num_bases, basenames, rset_nmem, &result_sets, &num_result_sets, kc); + if (res != ZEBRA_OK) return res; + + if (num_result_sets > 0) + { + RSET first_set = 0; + res = rpn_search_APT_position(zh, zapt, attributeSet, + reg_type, + num_bases, basenames, + rset_nmem, &first_set, + kc); + if (res != ZEBRA_OK) + return res; + if (first_set) + { + RSET *nsets = nmem_malloc(stream, + sizeof(RSET) * (num_result_sets+1)); + nsets[0] = first_set; + memcpy(nsets+1, result_sets, sizeof(RSET) * num_result_sets); + result_sets = nsets; + num_result_sets++; + } + } if (num_result_sets == 0) *rset = rset_create_null(rset_nmem, kc, 0); else if (num_result_sets 
== 1) @@ -2478,6 +2587,10 @@ static int scan_handle (char *name, const char *info, int pos, void *client) else idx = - pos - 1; + /* skip special terms.. of no interest */ + if (name[len_prefix] < 4) + return 1; + if (idx < 0) return 0; scan_info->list[idx].term = (char *) diff --git a/rset/rsmultiandor.c b/rset/rsmultiandor.c index 2714460..6c3145c 100644 --- a/rset/rsmultiandor.c +++ b/rset/rsmultiandor.c @@ -1,4 +1,4 @@ -/* $Id: rsmultiandor.c,v 1.25 2006-08-16 13:14:55 adam Exp $ +/* $Id: rsmultiandor.c,v 1.26 2006-09-08 14:40:55 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -442,7 +442,6 @@ static int r_read_or (RSFD rfd, void *buf, TERMID *term) *term = rset->term; else *term = it->term; - assert(*term); } (mrfd->hits)++; rdres = rset_read(it->fd, it->buf, &it->term); diff --git a/tab/default.idx b/tab/default.idx index 2d2355d..96f128f 100644 --- a/tab/default.idx +++ b/tab/default.idx @@ -1,5 +1,5 @@ # Zebra indexes as referred to from the *.abs-files. -# $Id: default.idx,v 1.13 2006-06-22 09:48:09 adam Exp $ +# $Id: default.idx,v 1.14 2006-09-08 14:40:56 adam Exp $ # # Traditional word index @@ -9,6 +9,7 @@ index w completeness 0 position 1 alwaysmatches 1 +firstinfield 1 charmap string.chr # Phrase index diff --git a/test/api/t9.c b/test/api/t9.c index 5432b37..8cba20a 100644 --- a/test/api/t9.c +++ b/test/api/t9.c @@ -1,4 +1,4 @@ -/* $Id: t9.c,v 1.11 2006-08-14 10:40:22 adam Exp $ +/* $Id: t9.c,v 1.12 2006-09-08 14:40:57 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -34,16 +34,11 @@ static void tst(int argc, char **argv) YAZ_CHECK(tl_init_data(zh, recs)); YAZ_CHECK(tl_ranking_query(zh, "@attr 1=4 @attr 2=102 the", - 3, "first title", 1000 )); + 3, "first title", 936 )); YAZ_CHECK(tl_ranking_query(zh, "@attr 1=62 @attr 2=102 foo", 3, "second title", 850 )); - /* get the record with the most significant hit, that is the 'bar' */ - /* as that is the rarest of my search words */ - YAZ_CHECK(tl_ranking_query(zh, "@attr 1=1016 @attr 2=102 @or @or 
the foo bar", - 3, "third title", 813 )); - YAZ_CHECK(tl_close_down(zh, zs)); } diff --git a/test/marcxml/t1.c b/test/marcxml/t1.c index 4b4958e..da51367 100644 --- a/test/marcxml/t1.c +++ b/test/marcxml/t1.c @@ -1,4 +1,4 @@ -/* $Id: t1.c,v 1.10 2006-08-22 08:11:32 adam Exp $ +/* $Id: t1.c,v 1.11 2006-09-08 14:40:58 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -49,6 +49,12 @@ static void tst(int argc, char **argv) YAZ_CHECK(tl_query(zh, "@attr 1=leader 00366", 2)); YAZ_CHECK(tl_query(zh, "@attr 1=leader2 nam", 2)); + YAZ_CHECK(tl_query(zh, "@attr 1=1003 jack", 2)); + YAZ_CHECK(tl_query(zh, "@attr 1=1003 jack", 2)); + YAZ_CHECK(tl_query(zh, "@attr 1=1003 collins", 2)); + YAZ_CHECK(tl_query(zh, "@attr 1=1003 @attr 3=1 collins", 0)); + YAZ_CHECK(tl_query(zh, "@attr 1=4 @attr 3=1 program", 0)); + YAZ_CHECK(tl_query(zh, "@attr 1=4 @attr 3=1 to", 0)); YAZ_CHECK(tl_close_down(zh, zs)); } diff --git a/util/zebramap.c b/util/zebramap.c index 7dc5d0f..b075257 100644 --- a/util/zebramap.c +++ b/util/zebramap.c @@ -1,4 +1,4 @@ -/* $Id: zebramap.c,v 1.52 2006-08-15 14:28:35 adam Exp $ +/* $Id: zebramap.c,v 1.53 2006-09-08 14:41:00 adam Exp $ Copyright (C) 1995-2006 Index Data ApS @@ -40,6 +40,7 @@ struct zebra_map { int completeness; int positioned; int alwaysmatches; + int first_in_field; int type; union { struct { @@ -125,6 +126,7 @@ ZEBRA_RES zebra_maps_read_file(ZebraMaps zms, const char *fname) (*zm)->completeness = 0; (*zm)->positioned = 1; (*zm)->alwaysmatches = 0; + (*zm)->first_in_field = 0; zms->no_maps++; } else if (!yaz_matchstr(argv[0], "sort")) @@ -142,6 +144,7 @@ ZEBRA_RES zebra_maps_read_file(ZebraMaps zms, const char *fname) (*zm)->completeness = 0; (*zm)->positioned = 0; (*zm)->alwaysmatches = 0; + (*zm)->first_in_field = 0; zms->no_maps++; } else if (!zm) @@ -166,6 +169,10 @@ ZEBRA_RES zebra_maps_read_file(ZebraMaps zms, const char *fname) { (*zm)->alwaysmatches = atoi(argv[1]); } + else if (!yaz_matchstr(argv[0], "firstinfield") && argc == 2) + { + 
(*zm)->first_in_field = atoi(argv[1]); + } else if (!yaz_matchstr(argv[0], "entrysize") && argc == 2) { if ((*zm)->type == ZEBRA_MAP_TYPE_SORT) @@ -346,6 +353,14 @@ int zebra_maps_is_alwaysmatches(ZebraMaps zms, unsigned reg_id) return 0; } +int zebra_maps_is_first_in_field(ZebraMaps zms, unsigned reg_id) +{ + struct zebra_map *zm = zebra_map_get(zms, reg_id); + if (zm) + return zm->first_in_field; + return 0; +} + int zebra_maps_sort(ZebraMaps zms, Z_SortAttributes *sortAttributes, int *numerical) { -- 1.7.10.4