From 47ea1fc957c7b97bb30a26698f072109cae275e4 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Thu, 25 Jul 2002 13:06:43 +0000 Subject: [PATCH] Character set negotiation updates --- CHANGELOG | 19 ++++- TODO | 7 +- bfile/mfile.c | 4 +- configure.in | 8 +- include/recctrl.h | 6 +- include/rset.h | 9 +- include/zebraver.h | 6 +- index/index.h | 14 ++- index/retrieve.c | 6 +- index/trunc.c | 37 +++++--- index/zebraapi.c | 32 ++++++- index/zebraapi.h | 12 ++- index/zrpn.c | 129 +++++++++++++++++++--------- index/zserver.c | 70 ++++++++++++--- index/zserver.h | 3 +- index/zsets.c | 54 ++++++++++-- recctrl/recgrs.c | 23 +++-- rset/rset.c | 9 +- rset/rsnull.c | 8 +- test/dmoz/plot.dem | 2 +- test/dmoz/zebra-b.cfg | 2 +- test/gils/zebra.cfg | 4 +- util/charmap.c | 225 +++++++++++++++++++++++++++++++++++++++++-------- 23 files changed, 539 insertions(+), 150 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 40ae05a..72338f9 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,22 @@ ---- 1.2.0 2002/MM/DD +--- 1.3.0 2002/MM/DD + +Zebra uses UTF-8 internally: + +1) New setting "encoding" for zebra.cfg that specifies encoding for +OCTET terms in queries and record encoding for most transfer syntaxes +(except those that use International Strings, such as GRS-1). + +2) The encoding of International Strings is UTF-8 by default. It +may be changed by character set negotiation. If character set +negotiation is in effect and if records are selected for conversion, +they will be converted to the selected character set - thus overriding +the encoding setting in zebra.cfg. + +3) New directive "encoding" in .abs-files. This specifies the +external encoding of files indexed by Zebra. However, if records themselves +have an XML header that specifies an encoding, that encoding will be used +instead. XML filter (-t grs.xml). 
diff --git a/TODO b/TODO index f85e221..c2cf5c7 100644 --- a/TODO +++ b/TODO @@ -1,13 +1,8 @@ Zebra TODO -$Id: TODO,v 1.11 2002-05-03 13:50:24 adam Exp $ +$Id: TODO,v 1.12 2002-07-25 13:06:43 adam Exp $ Make test scripts. -Add XML parser (EXPAT). - -Ensure that UTF-8 indexing works and that EXPAT using -other character set is converted to it. - Handle free lists for isamb (possibly others). Check that recordId: actually works. diff --git a/bfile/mfile.c b/bfile/mfile.c index 8e85e79..957e255 100644 --- a/bfile/mfile.c +++ b/bfile/mfile.c @@ -3,7 +3,7 @@ * All rights reserved. * Sebastian Hammer, Adam Dickmeiss * - * $Id: mfile.c,v 1.46 2002-07-16 09:52:20 heikki Exp $ + * $Id: mfile.c,v 1.47 2002-07-25 13:06:43 adam Exp $ */ @@ -248,7 +248,9 @@ MFile_area mf_init(const char *name, const char *spec, const char *base) dent->d_name); return 0; } +#ifndef WIN32 fsync(fd); +#endif close(fd); if (dirp->max_bytes >= 0) dirp->avail_bytes -= part_f->bytes; diff --git a/configure.in b/configure.in index 7f727f5..acea3e3 100644 --- a/configure.in +++ b/configure.in @@ -1,5 +1,5 @@ dnl Zebra, Index Data Aps, 1994-2002 -dnl $Id: configure.in,v 1.37 2002-07-03 10:04:34 adam Exp $ +dnl $Id: configure.in,v 1.38 2002-07-25 13:06:43 adam Exp $ dnl AC_INIT(include/zebraver.h) AC_MSG_CHECKING(for package) @@ -7,12 +7,12 @@ if test -r ${srcdir}/LICENSE.zmbol; then AC_MSG_RESULT([Z'mbol]) PROGPREFIX=zmbol AC_DEFINE(ZMBOL,1) - AM_INIT_AUTOMAKE(zmbol,1.2.0) + AM_INIT_AUTOMAKE(zmbol,1.3.0) else AC_MSG_RESULT([Zebra]) PROGPREFIX=zebra AC_DEFINE(ZMBOL,0) - AM_INIT_AUTOMAKE(zebra,1.2.0) + AM_INIT_AUTOMAKE(zebra,1.3.0) fi AM_CONDITIONAL(ISZMBOL,test $PACKAGE = zmbol) dnl ------ Substitutions @@ -114,7 +114,7 @@ else fi dnl dnl ------ times -AC_CHECK_HEADERS(sys/times.h) +AC_CHECK_HEADERS(sys/times.h iconv.h) dnl ------ mkstemp AC_CHECK_FUNCS(mkstemp) dnl diff --git a/include/recctrl.h b/include/recctrl.h index 72db67e..d992dbc 100644 --- a/include/recctrl.h +++ b/include/recctrl.h @@ -4,7 
+4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: recctrl.h,v $ - * Revision 1.36 2002-07-02 20:20:09 adam + * Revision 1.37 2002-07-25 13:06:43 adam + * Character set negotiation updates + * + * Revision 1.36 2002/07/02 20:20:09 adam * idzebra:{filename,score,size,localnumber} tags for XML * * Revision 1.35 2002/04/13 18:16:42 adam @@ -184,6 +187,7 @@ struct recRetrieveCtrl { off_t (*tellf)(void *fh); oid_value input_format; /* Preferred record syntax */ Z_RecordComposition *comp; /* formatting instructions */ + char *encoding; /* preferred character encoding */ int localno; /* local id of record */ int score; /* score 0-1000 or -1 if none */ int recordSize; /* size of record in bytes */ diff --git a/include/rset.h b/include/rset.h index 65198ed..c604a86 100644 --- a/include/rset.h +++ b/include/rset.h @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: rset.h,v $ - * Revision 1.17 2002-03-20 20:24:29 adam + * Revision 1.18 2002-07-25 13:06:43 adam + * Character set negotiation updates + * + * Revision 1.17 2002/03/20 20:24:29 adam * Hits per term. Returned in SearchResult-1 * * Revision 1.16 1999/02/02 14:50:38 adam @@ -100,6 +103,7 @@ struct rset_term { int nn; char *flags; int count; + int type; }; typedef struct rset @@ -112,7 +116,8 @@ typedef struct rset int no_rset_terms; } rset; -RSET_TERM rset_term_create (const char *name, int length, const char *flags); +RSET_TERM rset_term_create (const char *name, int length, const char *flags, + int type); void rset_term_destroy (RSET_TERM t); RSET_TERM rset_term_dup (RSET_TERM t); diff --git a/include/zebraver.h b/include/zebraver.h index 62a5c36..2680419 100644 --- a/include/zebraver.h +++ b/include/zebraver.h @@ -2,13 +2,13 @@ * Copyright (C) 1994-2002, Index Data * All rights reserved. 
* - * $Id: zebraver.h,v 1.20 2002-04-05 08:46:26 adam Exp $ + * $Id: zebraver.h,v 1.21 2002-07-25 13:06:43 adam Exp $ */ #ifndef ZEBRAVER -#define ZEBRAVER "1.2.0" +#define ZEBRAVER "1.3.0" #endif #ifndef ZEBRADATE -#define ZEBRADATE "$Date: 2002-04-05 08:46:26 $" +#define ZEBRADATE "$Date: 2002-07-25 13:06:43 $" #endif diff --git a/index/index.h b/index/index.h index 9bcea72..3e532ff 100644 --- a/index/index.h +++ b/index/index.h @@ -2,7 +2,7 @@ * Copyright (C) 1995-2002, Index Data * All rights reserved. * Sebastian Hammer, Adam Dickmeiss, Heikki Levanto - * $Id: index.h,v 1.83 2002-04-26 08:44:47 adam Exp $ + * $Id: index.h,v 1.84 2002-07-25 13:06:43 adam Exp $ */ #ifndef INDEX_H @@ -18,6 +18,10 @@ #include #endif +#if HAVE_ICONV_H +#include +#endif + #include #include #include @@ -281,7 +285,11 @@ struct zebra_session { int records_updated; int records_deleted; int records_processed; - + char *record_encoding; +#if HAVE_ICONV_H + iconv_t iconv_to_utf8; + iconv_t iconv_from_utf8; +#endif }; struct rank_control { @@ -317,7 +325,7 @@ void rpn_scan (ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, RSET rset_trunc (ZebraHandle zh, ISAMS_P *isam_p, int no, const char *term, int length_term, const char *flags, - int preserve_position); + int preserve_position, int term_type); void resultSetAddTerm (ZebraHandle zh, ZebraSet s, int reg_type, const char *db, int set, diff --git a/index/retrieve.c b/index/retrieve.c index 1ce3f6b..cf0787c 100644 --- a/index/retrieve.c +++ b/index/retrieve.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: retrieve.c,v $ - * Revision 1.18 2002-07-02 20:20:09 adam + * Revision 1.19 2002-07-25 13:06:43 adam + * Character set negotiation updates + * + * Revision 1.18 2002/07/02 20:20:09 adam * idzebra:{filename,score,size,localnumber} tags for XML * * Revision 1.17 2002/05/03 13:49:04 adam @@ -215,6 +218,7 @@ int zebra_record_fetch (ZebraHandle zh, int sysno, int score, ODR stream, retrieveCtrl.odr = stream; 
retrieveCtrl.input_format = retrieveCtrl.output_format = input_format; retrieveCtrl.comp = comp; + retrieveCtrl.encoding = zh->record_encoding; retrieveCtrl.diagnostic = 0; retrieveCtrl.dh = zh->reg->dh; retrieveCtrl.res = zh->res; diff --git a/index/trunc.c b/index/trunc.c index 8d97ee0..ba3f18f 100644 --- a/index/trunc.c +++ b/index/trunc.c @@ -3,7 +3,7 @@ * All rights reserved. * Sebastian Hammer, Adam Dickmeiss, Heikki Levanto * - * $Id: trunc.c,v 1.25 2002-07-12 18:12:22 heikki Exp $ + * $Id: trunc.c,v 1.26 2002-07-25 13:06:43 adam Exp $ * */ #include @@ -121,7 +121,8 @@ static void heap_close (struct trunc_info *ti) static RSET rset_trunc_r (ZebraHandle zi, const char *term, int length, const char *flags, ISAMS_P *isam_p, int from, int to, - int merge_chunk, int preserve_position) + int merge_chunk, int preserve_position, + int term_type) { RSET result; RSFD result_rsfd; @@ -130,7 +131,7 @@ static RSET rset_trunc_r (ZebraHandle zi, const char *term, int length, parms.cmp = key_compare_it; parms.key_size = sizeof(struct it_key); parms.temp_path = res_get (zi->res, "setTmpDir"); - parms.rset_term = rset_term_create (term, length, flags); + parms.rset_term = rset_term_create (term, length, flags, term_type); result = rset_create (rset_kind_temp, &parms); result_rsfd = rset_open (result, RSETF_WRITE); @@ -152,11 +153,13 @@ static RSET rset_trunc_r (ZebraHandle zi, const char *term, int length, if (i_add <= to - i) rset[rscur] = rset_trunc_r (zi, term, length, flags, isam_p, i, i+i_add, - merge_chunk, preserve_position); + merge_chunk, preserve_position, + term_type); else rset[rscur] = rset_trunc_r (zi, term, length, flags, isam_p, i, to, - merge_chunk, preserve_position); + merge_chunk, preserve_position, + term_type); rscur++; } ti = heap_init (rscur, sizeof(struct it_key), key_compare_it); @@ -509,13 +512,13 @@ static int isamd_trunc_cmp (const void *p1, const void *p2) RSET rset_trunc (ZebraHandle zi, ISAMS_P *isam_p, int no, const char *term, int length, 
const char *flags, - int preserve_position) + int preserve_position, int term_type) { logf (LOG_DEBUG, "rset_trunc no=%d", no); if (no < 1) { rset_null_parms parms; - parms.rset_term = rset_term_create (term, length, flags); + parms.rset_term = rset_term_create (term, length, flags, term_type); return rset_create (rset_kind_null, &parms); } if (zi->reg->isams) @@ -526,7 +529,8 @@ RSET rset_trunc (ZebraHandle zi, ISAMS_P *isam_p, int no, parms.pos = *isam_p; parms.is = zi->reg->isams; - parms.rset_term = rset_term_create (term, length, flags); + parms.rset_term = rset_term_create (term, length, flags, + term_type); return rset_create (rset_kind_isams, &parms); } qsort (isam_p, no, sizeof(*isam_p), isams_trunc_cmp); @@ -539,7 +543,8 @@ RSET rset_trunc (ZebraHandle zi, ISAMS_P *isam_p, int no, parms.pos = *isam_p; parms.is = zi->reg->isam; - parms.rset_term = rset_term_create (term, length, flags); + parms.rset_term = rset_term_create (term, length, flags, + term_type); return rset_create (rset_kind_isam, &parms); } qsort (isam_p, no, sizeof(*isam_p), isam_trunc_cmp); @@ -554,7 +559,8 @@ RSET rset_trunc (ZebraHandle zi, ISAMS_P *isam_p, int no, parms.cmp = key_compare_it; parms.pos = *isam_p; parms.is = zi->reg->isamc; - parms.rset_term = rset_term_create (term, length, flags); + parms.rset_term = rset_term_create (term, length, flags, + term_type); return rset_create (rset_kind_isamc, &parms); } #if NEW_TRUNC @@ -568,7 +574,8 @@ RSET rset_trunc (ZebraHandle zi, ISAMS_P *isam_p, int no, parms.isam_positions = isam_p; parms.no_isam_positions = no; parms.no_save_positions = 100000; - parms.rset_term = rset_term_create (term, length, flags); + parms.rset_term = rset_term_create (term, length, flags, + term_type); return rset_create (rset_kind_m_or, &parms); } #endif @@ -584,7 +591,8 @@ RSET rset_trunc (ZebraHandle zi, ISAMS_P *isam_p, int no, abort(); /* parms.pos = *isam_p; */ parms.is = zi->reg->isamd; - parms.rset_term = rset_term_create (term, length, flags); + 
parms.rset_term = rset_term_create (term, length, flags, + term_type); return rset_create (rset_kind_isamd, &parms); } #if NEW_TRUNC_NOT_DONE_FOR_ISAM_D @@ -615,7 +623,8 @@ RSET rset_trunc (ZebraHandle zi, ISAMS_P *isam_p, int no, parms.cmp = key_compare_it; parms.pos = *isam_p; parms.is = zi->reg->isamb; - parms.rset_term = rset_term_create (term, length, flags); + parms.rset_term = rset_term_create (term, length, flags, + term_type); return rset_create (rset_kind_isamb, &parms); } qsort (isam_p, no, sizeof(*isam_p), isamd_trunc_cmp); @@ -626,6 +635,6 @@ RSET rset_trunc (ZebraHandle zi, ISAMS_P *isam_p, int no, return rset_create (rset_kind_null, NULL); } return rset_trunc_r (zi, term, length, flags, isam_p, 0, no, 100, - preserve_position); + preserve_position, term_type); } diff --git a/index/zebraapi.c b/index/zebraapi.c index 29ad2f4..fabd5e4 100644 --- a/index/zebraapi.c +++ b/index/zebraapi.c @@ -2,7 +2,7 @@ * Copyright (C) 1995-2002, Index Data * All rights reserved. * - * $Id: zebraapi.c,v 1.62 2002-07-15 11:50:45 adam Exp $ + * $Id: zebraapi.c,v 1.63 2002-07-25 13:06:43 adam Exp $ */ #include @@ -53,6 +53,7 @@ static void zebra_register_close (ZebraService zs, struct zebra_register *reg); ZebraHandle zebra_open (ZebraService zs) { ZebraHandle zh; + const char *default_encoding; if (!zs) return 0; @@ -82,6 +83,21 @@ ZebraHandle zebra_open (ZebraService zs) zh->shadow_enable = 1; + default_encoding = res_get_def(zs->global_res, "encoding", "ISO-8859-1"); + zh->record_encoding = xstrdup (default_encoding); +#if HAVE_ICONV_H + zh->iconv_to_utf8 = + iconv_open ("UTF-8", default_encoding); + if (zh->iconv_to_utf8 == (iconv_t)(-1)) + yaz_log (LOG_WARN, "iconv: %s to UTF-8 unsupported", + default_encoding); + zh->iconv_from_utf8 = + iconv_open (default_encoding, "UTF-8"); + if (zh->iconv_to_utf8 == (iconv_t)(-1)) + yaz_log (LOG_WARN, "iconv: UTF-8 to %s unsupported", + default_encoding); +#endif + zebra_mutex_cond_lock (&zs->session_lock); zh->next = 
zs->sessions; @@ -397,6 +413,14 @@ void zebra_close (ZebraHandle zh) zebra_register_close (zh->service, zh->reg); zebra_close_res (zh); + xfree (zh->record_encoding); +#if HAVE_ICONV_H + if (zh->iconv_to_utf8 != (iconv_t) (-1)) + iconv_close (zh->iconv_to_utf8); + if (zh->iconv_from_utf8 != (iconv_t) (-1)) + iconv_close (zh->iconv_from_utf8); +#endif + xfree (zh->admin_databaseName); zebra_mutex_cond_lock (&zs->session_lock); zebra_lock_destroy (zh->lock_normal); @@ -1304,3 +1328,9 @@ void zebra_shadow_enable (ZebraHandle zh, int value) zh->shadow_enable = value; } +int zebra_record_encoding (ZebraHandle zh, const char *encoding) +{ + xfree (zh->record_encoding); + zh->record_encoding = xstrdup (encoding); + return 0; +} diff --git a/index/zebraapi.h b/index/zebraapi.h index 2551093..b1463b8 100644 --- a/index/zebraapi.h +++ b/index/zebraapi.h @@ -3,7 +3,7 @@ * All rights reserved. * Sebastian Hammer, Adam Dickmeiss * - * $Id: zebraapi.h,v 1.17 2002-07-11 13:03:01 heikki Exp $ + * $Id: zebraapi.h,v 1.18 2002-07-25 13:06:43 adam Exp $ */ #ifndef ZEBRAAPI_H @@ -130,9 +130,10 @@ YAZ_EXPORT void zebra_set_group (ZebraHandle zh, struct recordGroup *rg); YAZ_EXPORT void zebra_result (ZebraHandle zh, int *code, char **addinfo); -YAZ_EXPORT const char *zebra_resultSetTerms (ZebraHandle zh, - const char *setname, - int no, int *count, int *no_max); + +YAZ_EXPORT int zebra_resultSetTerms (ZebraHandle zh, const char *setname, + int no, int *count, + int *type, char *out, size_t *len); YAZ_EXPORT void zebra_sort (ZebraHandle zh, ODR stream, int num_input_setnames, @@ -155,5 +156,8 @@ void zebra_shadow_enable (ZebraHandle zh, int value); YAZ_EXPORT void zebra_register_statistics (ZebraHandle zh, int dumpdict); +YAZ_EXPORT +int zebra_record_encoding (ZebraHandle zh, const char *encoding); + YAZ_END_CDECL #endif diff --git a/index/zrpn.c b/index/zrpn.c index 846c1ed..88dbefc 100644 --- a/index/zrpn.c +++ b/index/zrpn.c @@ -3,7 +3,7 @@ * All rights reserved. 
* Sebastian Hammer, Adam Dickmeiss * - * $Id: zrpn.c,v 1.116 2002-07-03 10:05:19 adam Exp $ + * $Id: zrpn.c,v 1.117 2002-07-25 13:06:43 adam Exp $ */ #include #include @@ -250,6 +250,8 @@ static int term_pre (ZebraMaps zebra_maps, int reg_type, const char **src, return *s0; } +#define REGEX_CHARS "[]()|.*+!" + /* term_100: handle term, where trunc=none (no operators at all) */ static int term_100 (ZebraMaps zebra_maps, int reg_type, const char **src, char *dst, int space_split, @@ -287,7 +289,7 @@ static int term_100 (ZebraMaps zebra_maps, int reg_type, { /* reload last space */ while (space_start < space_end) { - if (!isalnum (*space_start) && *space_start != '-') + if (strchr (REGEX_CHARS, *space_start)) dst[i++] = '\\'; dst_term[j++] = *space_start; dst[i++] = *space_start++; @@ -299,7 +301,7 @@ static int term_100 (ZebraMaps zebra_maps, int reg_type, /* add non-space char */ while (s1 < s0) { - if (!isalnum (*s1) && *s1 != '-') + if (strchr(REGEX_CHARS, *s1)) dst[i++] = '\\'; dst_term[j++] = *s1; dst[i++] = *s1++; @@ -340,7 +342,7 @@ static int term_101 (ZebraMaps zebra_maps, int reg_type, break; while (s1 < s0) { - if (!isalnum (*s1)) + if (strchr(REGEX_CHARS, *s1)) dst[i++] = '\\'; dst_term[j++] = *s1; dst[i++] = *s1++; @@ -389,7 +391,7 @@ static int term_103 (ZebraMaps zebra_maps, int reg_type, const char **src, break; while (s1 < s0) { - if (!isalnum (*s1)) + if (strchr(REGEX_CHARS, *s1)) dst[i++] = '\\'; dst_term[j++] = *s1; dst[i++] = *s1++; @@ -444,7 +446,7 @@ static int term_104 (ZebraMaps zebra_maps, int reg_type, break; while (s1 < s0) { - if (!isalnum (*s1)) + if (strchr(REGEX_CHARS, *s1)) dst[i++] = '\\'; dst_term[j++] = *s1; dst[i++] = *s1++; @@ -490,7 +492,7 @@ static int term_105 (ZebraMaps zebra_maps, int reg_type, break; while (s1 < s0) { - if (!isalnum (*s1)) + if (strchr(REGEX_CHARS, *s1)) dst[i++] = '\\'; dst_term[j++] = *s1; dst[i++] = *s1++; @@ -824,7 +826,8 @@ static RSET term_trunc (ZebraHandle zh, Z_AttributesPlusTerm *zapt, logf 
(LOG_DEBUG, "term: %s", term_dst); return rset_trunc (zh, grep_info->isam_p_buf, grep_info->isam_p_indx, term_dst, - strlen(term_dst), rank_type, 1 /* preserve pos */); + strlen(term_dst), rank_type, 1 /* preserve pos */, + zapt->term->which); } @@ -1071,17 +1074,55 @@ static int string_term (ZebraHandle zh, Z_AttributesPlusTerm *zapt, return 1; } -static void trans_term (ZebraHandle zh, Z_AttributesPlusTerm *zapt, - char *termz) + +static int trans_term (ZebraHandle zh, Z_AttributesPlusTerm *zapt, + char *termz) { size_t sizez; Z_Term *term = zapt->term; - sizez = term->u.general->len; - if (sizez > IT_MAX_WORD-1) - sizez = IT_MAX_WORD-1; - memcpy (termz, term->u.general->buf, sizez); - termz[sizez] = '\0'; + switch (term->which) + { + case Z_Term_general: +#if HAVE_ICONV_H + if (zh->iconv_to_utf8 != (iconv_t)(-1)) + { + char *inbuf = term->u.general->buf; + size_t inleft = term->u.general->len; + char *outbuf = termz; + size_t outleft = IT_MAX_WORD-1; + size_t ret; + + yaz_log (LOG_DEBUG, "converting general from ISO-8859-1"); + ret = iconv(zh->iconv_to_utf8, &inbuf, &inleft, + &outbuf, &outleft); + if (ret == (size_t)(-1)) + { + ret = iconv(zh->iconv_to_utf8, 0, 0, 0, 0); + zh->errCode = 125; + return -1; + } + *outbuf = 0; + return 0; + } +#endif + sizez = term->u.general->len; + if (sizez > IT_MAX_WORD-1) + sizez = IT_MAX_WORD-1; + memcpy (termz, term->u.general->buf, sizez); + termz[sizez] = '\0'; + break; + case Z_Term_characterString: + sizez = strlen(term->u.characterString); + if (sizez > IT_MAX_WORD-1) + sizez = IT_MAX_WORD-1; + memcpy (termz, term->u.characterString, sizez); + termz[sizez] = '\0'; + break; + default: + zh->errCode = 124; + } + return 0; } static void trans_scan_term (ZebraHandle zh, Z_AttributesPlusTerm *zapt, @@ -1126,6 +1167,7 @@ static RSET rpn_prox (ZebraHandle zh, RSET *rset, int rset_no, int length_prox_term = 0; int min_nn = 10000000; int term_index; + int term_type = Z_Term_characterString; const char *flags = NULL; rsfd = (RSFD 
*) xmalloc (sizeof(*rsfd)*rset_no); @@ -1151,6 +1193,12 @@ static RSET rpn_prox (ZebraHandle zh, RSET *rset, int rset_no, if (min_nn > rset[i]->rset_terms[j]->nn) min_nn = rset[i]->rset_terms[j]->nn; flags = nflags; + term_type = rset[i]->rset_terms[j]->type; + + /* only if all term types are of type characterString .. */ + /* the resulting term is of that type */ + if (term_type != Z_Term_characterString) + term_type = Z_Term_general; } } for (i = 0; inn = 0; result = rset_create (rset_kind_null, &parms); } @@ -1182,7 +1230,7 @@ static RSET rpn_prox (ZebraHandle zh, RSET *rset, int rset_no, RSFD rsfd_result; parms.rset_term = rset_term_create (prox_term, length_prox_term, - flags); + flags, term_type); parms.rset_term->nn = min_nn; parms.cmp = key_compare_it; parms.key_size = sizeof (struct it_key); @@ -1241,7 +1289,7 @@ static RSET rpn_prox (ZebraHandle zh, RSET *rset, int rset_no, logf (LOG_LOG, "generic prox, dist = %d, relation = %d, ordered =%d, exclusion=%d", distance, relation, ordered, exclusion); parms.rset_term = rset_term_create (prox_term, length_prox_term, - flags); + flags, term_type); parms.rset_term->nn = min_nn; parms.cmp = key_compare_it; parms.key_size = sizeof (struct it_key); @@ -1321,7 +1369,7 @@ static RSET rpn_prox (ZebraHandle zh, RSET *rset, int rset_no, rset_null_parms parms; parms.rset_term = rset_term_create (prox_term, length_prox_term, - flags); + flags, term_type); parms.rset_term->nn = 0; result = rset_create (rset_kind_null, &parms); } @@ -1474,7 +1522,8 @@ static RSET rpn_search_APT_phrase (ZebraHandle zh, { rset_null_parms parms; - parms.rset_term = rset_term_create (termz, -1, rank_type); + parms.rset_term = rset_term_create (termz, -1, rank_type, + zapt->term->which); return rset_create (rset_kind_null, &parms); } else if (rset_no == 1) @@ -1521,7 +1570,8 @@ static RSET rpn_search_APT_or_list (ZebraHandle zh, { rset_null_parms parms; - parms.rset_term = rset_term_create (termz, -1, rank_type); + parms.rset_term = 
rset_term_create (termz, -1, rank_type, + zapt->term->which); return rset_create (rset_kind_null, &parms); } result = rset[0]; @@ -1575,7 +1625,8 @@ static RSET rpn_search_APT_and_list (ZebraHandle zh, { rset_null_parms parms; - parms.rset_term = rset_term_create (termz, -1, rank_type); + parms.rset_term = rset_term_create (termz, -1, rank_type, + zapt->term->which); return rset_create (rset_kind_null, &parms); } result = rset[0]; @@ -1767,7 +1818,8 @@ static RSET rpn_search_APT_numeric (ZebraHandle zh, rset[rset_no] = rset_trunc (zh, grep_info.isam_p_buf, grep_info.isam_p_indx, term_dst, strlen(term_dst), rank_type, - 0 /* preserve position */); + 0 /* preserve position */, + zapt->term->which); assert (rset[rset_no]); if (++rset_no >= (int) (sizeof(rset)/sizeof(*rset))) break; @@ -1777,7 +1829,8 @@ static RSET rpn_search_APT_numeric (ZebraHandle zh, { rset_null_parms parms; - parms.rset_term = rset_term_create (term_dst, -1, rank_type); + parms.rset_term = rset_term_create (term_dst, -1, rank_type, + zapt->term->which); return rset_create (rset_kind_null, &parms); } result = rset[0]; @@ -1805,7 +1858,8 @@ static RSET rpn_search_APT_local (ZebraHandle zh, Z_AttributesPlusTerm *zapt, struct it_key key; rset_temp_parms parms; - parms.rset_term = rset_term_create (termz, -1, rank_type); + parms.rset_term = rset_term_create (termz, -1, rank_type, + zapt->term->which); parms.cmp = key_compare_it; parms.key_size = sizeof (struct it_key); parms.temp_path = res_get (zh->res, "setTmpDir"); @@ -1918,7 +1972,8 @@ static RSET rpn_sort_spec (ZebraHandle zh, Z_AttributesPlusTerm *zapt, sort_sequence->specs[i] = sks; - parms.rset_term = rset_term_create (termz, -1, rank_type); + parms.rset_term = rset_term_create (termz, -1, rank_type, + zapt->term->which); return rset_create (rset_kind_null, &parms); } @@ -2001,7 +2056,7 @@ static RSET rpn_search_xpath (ZebraHandle zh, Z_AttributesPlusTerm *zapt, rset_start_tag = rset_trunc (zh, grep_info.isam_p_buf, grep_info.isam_p_indx, 
use_string, strlen(use_string), - rank_type, 1); + rank_type, 1, zapt->term->which); prefix_len = 0; ord = zebraExplain_lookupSU (zh->reg->zei, curAttributeSet, 2); @@ -2035,7 +2090,7 @@ static RSET rpn_search_xpath (ZebraHandle zh, Z_AttributesPlusTerm *zapt, rset_end_tag = rset_trunc (zh, grep_info.isam_p_buf, grep_info.isam_p_indx, use_string, strlen(use_string), - rank_type, 1); + rank_type, 1, zapt->term->which); parms.key_size = sizeof(struct it_key); parms.cmp = key_compare_it; @@ -2074,12 +2129,8 @@ static RSET rpn_search_APT (ZebraHandle zh, Z_AttributesPlusTerm *zapt, logf (LOG_DEBUG, "search_type=%s", search_type); logf (LOG_DEBUG, "rank_type=%s", rank_type); - if (zapt->term->which != Z_Term_general) - { - zh->errCode = 124; - return NULL; - } - trans_term (zh, zapt, termz); + if (trans_term (zh, zapt, termz)) + return 0; if (sort_flag) return rpn_sort_spec (zh, zapt, attributeSet, stream, sort_sequence, @@ -2524,7 +2575,7 @@ void rpn_scan (ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, &glist[i+before].term, mterm); rset = rset_trunc (zh, &scan_info_array[j0].list[ptr[j0]].isam_p, 1, glist[i+before].term, strlen(glist[i+before].term), - NULL, 0); + NULL, 0, zapt->term->which); ptr[j0]++; for (j = j0+1; jterm->which); bool_parms.key_size = sizeof(struct it_key); bool_parms.cmp = key_compare_it; @@ -2590,7 +2642,7 @@ void rpn_scan (ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, rset = rset_trunc (zh, &scan_info_array[j0].list[before-1-ptr[j0]].isam_p, 1, glist[before-1-i].term, strlen(glist[before-1-i].term), - NULL, 0); + NULL, 0, zapt->term->which); ptr[j0]++; @@ -2606,7 +2658,8 @@ void rpn_scan (ZebraHandle zh, ODR stream, Z_AttributesPlusTerm *zapt, rset2 = rset_trunc (zh, &scan_info_array[j].list[before-1-ptr[j]].isam_p, 1, glist[before-1-i].term, - strlen(glist[before-1-i].term), NULL, 0); + strlen(glist[before-1-i].term), NULL, 0, + zapt->term->which); bool_parms.key_size = sizeof(struct it_key); bool_parms.cmp = key_compare_it; 
diff --git a/index/zserver.c b/index/zserver.c index 4664c56..6b34587 100644 --- a/index/zserver.c +++ b/index/zserver.c @@ -2,7 +2,7 @@ * Copyright (C) 1995-2002, Index Data * All rights reserved. * - * $Id: zserver.c,v 1.88 2002-05-07 11:05:19 adam Exp $ + * $Id: zserver.c,v 1.89 2002-07-25 13:06:43 adam Exp $ */ #include @@ -89,6 +89,38 @@ bend_initresult *bend_init (bend_initrequest *q) return r; } r->handle = zh; + if (q->charneg_request) /* characater set and langauge negotiation? */ + { + char **charsets = 0; + int num_charsets; + char **langs = 0; + int num_langs = 0; + int selected = 0; + int i; + + NMEM nmem = nmem_create (); + yaz_log (LOG_LOG, "character set and language negotiation"); + + yaz_get_proposal_charneg (nmem, q->charneg_request, + &charsets, &num_charsets, + &langs, &num_langs, &selected); + for (i = 0; i < num_charsets; i++) + { + yaz_log (LOG_LOG, "charset %d %s", i, charsets[i]); + + if (odr_set_charset (q->decode, "UTF-8", charsets[i]) == 0) + { + odr_set_charset (q->stream, charsets[i], "UTF-8"); + if (selected) + zebra_record_encoding (zh, charsets[i]); + q->charneg_response = + yaz_set_response_charneg (q->stream, charsets[i], + 0, selected); + break; + } + } + nmem_destroy (nmem); + } return r; } @@ -97,11 +129,12 @@ static void search_terms (ZebraHandle zh, bend_search_rr *r) int count; int no_terms; int i; + int type; struct Z_External *ext; Z_SearchInfoReport *sr; /* get no of terms for result set */ - zebra_resultSetTerms (zh, r->setname, -1, &count, &no_terms); + no_terms = zebra_resultSetTerms (zh, r->setname, 0, 0, 0, 0, 0); if (!no_terms) return; @@ -129,8 +162,10 @@ static void search_terms (ZebraHandle zh, bend_search_rr *r) for (i = 0; isetname, i, - &count, &no_terms); + char outbuf[1024]; + size_t len = sizeof(outbuf); + zebra_resultSetTerms (zh, r->setname, i, + &count, &type, outbuf, &len); sr->elements[i] = odr_malloc (r->stream, sizeof(**sr->elements)); sr->elements[i]->subqueryId = 0; @@ -145,14 +180,25 @@ static 
void search_terms (ZebraHandle zh, bend_search_rr *r) odr_malloc (r->stream, sizeof(Z_QueryExpressionTerm)); term = odr_malloc (r->stream, sizeof(Z_Term)); sr->elements[i]->subqueryExpression->u.term->queryTerm = term; - - term->which = Z_Term_general; - term->u.general = odr_malloc (r->stream, sizeof(Odr_oct)); - term->u.general->buf = odr_strdup (r->stream, termz); - - term->u.general->len = strlen (termz); - term->u.general->size = strlen (termz); - + switch (type) + { + case Z_Term_characterString: + yaz_log (LOG_LOG, "term as characterString"); + term->which = Z_Term_characterString; + term->u.characterString = odr_strdup (r->stream, outbuf); + break; + case Z_Term_general: + yaz_log (LOG_LOG, "term as general"); + term->which = Z_Term_general; + term->u.general = odr_malloc (r->stream, sizeof(*term->u.general)); + term->u.general->size = term->u.general->len = len; + term->u.general->buf = odr_malloc (r->stream, len); + memcpy (term->u.general->buf, outbuf, len); + break; + default: + term->which = Z_Term_general; + term->u.null = odr_nullval(); + } sr->elements[i]->subqueryExpression->u.term->termComment = 0; sr->elements[i]->subqueryInterpretation = 0; sr->elements[i]->subqueryRecommendation = 0; diff --git a/index/zserver.h b/index/zserver.h index fde032a..ad43cab 100644 --- a/index/zserver.h +++ b/index/zserver.h @@ -3,10 +3,11 @@ * All rights reserved. * Sebastian Hammer, Adam Dickmeiss * - * $Id: zserver.h,v 1.55 2002-04-04 14:14:13 adam Exp $ + * $Id: zserver.h,v 1.56 2002-07-25 13:06:43 adam Exp $ */ #include +#include #include "zebraapi.h" YAZ_BEGIN_CDECL diff --git a/index/zsets.c b/index/zsets.c index ed43fab..d3fb047 100644 --- a/index/zsets.c +++ b/index/zsets.c @@ -3,7 +3,7 @@ * All rights reserved. 
* Sebastian Hammer, Adam Dickmeiss * - * $Id: zsets.c,v 1.36 2002-04-18 20:22:09 adam Exp $ + * $Id: zsets.c,v 1.37 2002-07-25 13:06:43 adam Exp $ */ #include #include @@ -119,20 +119,56 @@ void resultSetAddTerm (ZebraHandle zh, ZebraSet s, int reg_type, } -const char *zebra_resultSetTerms (ZebraHandle zh, const char *setname, - int no, int *count, int *no_max) +int zebra_resultSetTerms (ZebraHandle zh, const char *setname, + int no, int *count, + int *type, char *out, size_t *len) { ZebraSet s = resultSetGet (zh, setname); + int no_max = 0; - *count = 0; - *no_max = 0; + if (count) + *count = 0; if (!s || !s->rset) return 0; - *no_max = s->rset->no_rset_terms; - if (no < 0 || no >= *no_max) + no_max = s->rset->no_rset_terms; + if (no < 0 || no >= no_max) return 0; - *count = s->rset->rset_terms[no]->count; - return s->rset->rset_terms[no]->name; + if (count) + *count = s->rset->rset_terms[no]->count; + if (type) + *type = s->rset->rset_terms[no]->type; + + if (out) + { + char *inbuf = s->rset->rset_terms[no]->name; + size_t inleft = strlen(inbuf); + size_t outleft = *len - 1; + int converted = 0; +#if HAVE_ICONV_H + if (zh->iconv_from_utf8 != (iconv_t)(-1)) + { + char *outbuf = out; + size_t ret; + + ret = iconv(zh->iconv_from_utf8, &inbuf, &inleft, + &outbuf, &outleft); + if (ret == (size_t)(-1)) + *len = 0; + else + *len = outbuf - out; + converted = 1; + } +#endif + if (!converted) + { + if (inleft > outleft) + inleft = outleft; + *len = inleft; + memcpy (out, inbuf, *len); + } + out[*len] = 0; + } + return no_max; } diff --git a/recctrl/recgrs.c b/recctrl/recgrs.c index b832911..7767bdf 100644 --- a/recctrl/recgrs.c +++ b/recctrl/recgrs.c @@ -2,7 +2,7 @@ * Copyright (C) 1994-2002, Index Data * All rights reserved. 
* - * $Id: recgrs.c,v 1.54 2002-07-05 16:07:02 adam Exp $ + * $Id: recgrs.c,v 1.55 2002-07-25 13:06:44 adam Exp $ */ #include @@ -396,10 +396,13 @@ static int grs_extract_sub(struct grs_handlers *h, struct recExtractCtrl *p, if ((oid_ent_to_oid (&oe, oidtmp))) (*p->schemaAdd)(p, oidtmp); } + + /* ensure our data1 tree is UTF-8 */ + data1_iconv (p->dh, mem, n, "UTF-8", data1_get_encoding(p->dh, n)); + #if 0 data1_pr_tree (p->dh, n, stdout); #endif - data1_iconv (p->dh, mem, n, "ISO-8859-1", "UTF-8"); (*p->init)(p, &wrd); if (dumpkeys(n, p, 0, &wrd) < 0) @@ -558,6 +561,9 @@ static int grs_retrieve(void *clientData, struct recRetrieveCtrl *p) nmem_destroy (mem); return 0; } + /* ensure our data1 tree is UTF-8 */ + data1_iconv (p->dh, mem, node, "UTF-8", data1_get_encoding(p->dh, node)); + #if 0 data1_pr_tree (p->dh, node, stdout); #endif @@ -708,20 +714,19 @@ static int grs_retrieve(void *clientData, struct recRetrieveCtrl *p) else if (p->comp && !res) selected = 1; -#if 0 - data1_pr_tree (p->dh, node, stdout); -#endif #if 1 - data1_iconv (p->dh, mem, node, "ISO-8859-1", "UTF-8"); + data1_pr_tree (p->dh, node, stdout); #endif logf (LOG_DEBUG, "grs_retrieve: transfer syntax mapping"); switch (p->output_format = (p->input_format != VAL_NONE ? 
p->input_format : VAL_SUTRS)) { - case VAL_TEXT_XML: add_idzebra_info (p, top, mem); + if (p->encoding) + data1_iconv (p->dh, mem, node, p->encoding, "UTF-8"); + if (!(p->rec_buf = data1_nodetoidsgml(p->dh, node, selected, &p->rec_len))) p->diagnostic = 238; @@ -755,6 +760,8 @@ static int grs_retrieve(void *clientData, struct recRetrieveCtrl *p) p->rec_len = (size_t) (-1); break; case VAL_SUTRS: + if (p->encoding) + data1_iconv (p->dh, mem, node, p->encoding, "UTF-8"); if (!(p->rec_buf = data1_nodetobuf(p->dh, node, selected, &p->rec_len))) p->diagnostic = 238; @@ -791,6 +798,8 @@ static int grs_retrieve(void *clientData, struct recRetrieveCtrl *p) p->diagnostic = 238; break; } + if (p->encoding) + data1_iconv (p->dh, mem, node, p->encoding, "UTF-8"); if (!(p->rec_buf = data1_nodetomarc(p->dh, marctab, node, selected, &p->rec_len))) p->diagnostic = 238; diff --git a/rset/rset.c b/rset/rset.c index 93fe8df..29c8060 100644 --- a/rset/rset.c +++ b/rset/rset.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: rset.c,v $ - * Revision 1.15 2002-03-20 20:24:30 adam + * Revision 1.16 2002-07-25 13:06:44 adam + * Character set negotiation updates + * + * Revision 1.15 2002/03/20 20:24:30 adam * Hits per term. 
Returned in SearchResult-1 * * Revision 1.14 1999/05/26 07:49:14 adam @@ -105,7 +108,8 @@ RSET_TERM *rset_terms(RSET rs, int *no) return rs->rset_terms; } -RSET_TERM rset_term_create (const char *name, int length, const char *flags) +RSET_TERM rset_term_create (const char *name, int length, const char *flags, + int type) { RSET_TERM t = (RSET_TERM) xmalloc (sizeof(*t)); if (!name) @@ -124,6 +128,7 @@ RSET_TERM rset_term_create (const char *name, int length, const char *flags) t->flags = xstrdup (flags); t->nn = -1; t->count = 0; + t->type = type; return t; } diff --git a/rset/rsnull.c b/rset/rsnull.c index 94d9be8..c54f364 100644 --- a/rset/rsnull.c +++ b/rset/rsnull.c @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: rsnull.c,v $ - * Revision 1.13 2002-03-21 10:25:42 adam + * Revision 1.14 2002-07-25 13:06:44 adam + * Character set negotiation updates + * + * Revision 1.13 2002/03/21 10:25:42 adam * use lockDir. Fixes for searchResult for null/sort sets * * Revision 1.12 1999/05/26 07:49:14 adam @@ -92,7 +95,8 @@ static void *r_create(RSET ct, const struct rset_control *sel, void *parms) if (parms && null_parms->rset_term) ct->rset_terms[0] = null_parms->rset_term; else - ct->rset_terms[0] = rset_term_create ("term", -1, "rank-0"); + ct->rset_terms[0] = rset_term_create ("term", -1, "rank-0", + 0); ct->rset_terms[0]->nn = 0; return NULL; diff --git a/test/dmoz/plot.dem b/test/dmoz/plot.dem index 4e68d22..79da980 100755 --- a/test/dmoz/plot.dem +++ b/test/dmoz/plot.dem @@ -1,6 +1,6 @@ set output "times-b.ps" set terminal postscript -set title "ISAM-b Mon Jul 15 13:16:34 CEST 2002" +set title "ISAM-b Mon Jul 15 14:06:44 CEST 2002" set xlabel "runs" set ylabel "seconds" plot [0:] [0:] 'times-b.log' using 2 title 'real' with linespoints, 'times-b.log' using 3 title 'user' with linespoints, 'times-b.log' using 4 title 'sys' with linespoints diff --git a/test/dmoz/zebra-b.cfg b/test/dmoz/zebra-b.cfg index df9ef73..3dde62e 100644 --- a/test/dmoz/zebra-b.cfg 
+++ b/test/dmoz/zebra-b.cfg @@ -13,5 +13,5 @@ recordtype: grs.sgml notimestamps: 1 -isam: b +isam: null register: reg-b:2G diff --git a/test/gils/zebra.cfg b/test/gils/zebra.cfg index 1acf688..443c97d 100644 --- a/test/gils/zebra.cfg +++ b/test/gils/zebra.cfg @@ -1,5 +1,5 @@ # Simple Zebra configuration file -# $Id: zebra.cfg,v 1.16 2002-05-07 11:04:37 adam Exp $ +# $Id: zebra.cfg,v 1.17 2002-07-25 13:06:44 adam Exp $ # # Where the schema files, attribute files, etc are located. profilePath: .:../../tab:../../../yaz/tab @@ -14,4 +14,4 @@ recordtype: grs.sgml #storekeys: 1 #storedata: 1 #recordId: (bib1,identifier-standard) -isam: b +isam: c diff --git a/util/charmap.c b/util/charmap.c index a9ddf72..823fe6a 100644 --- a/util/charmap.c +++ b/util/charmap.c @@ -3,7 +3,7 @@ * All rights reserved. * Sebastian Hammer, Adam Dickmeiss * - * $Id: charmap.c,v 1.22 2002-05-03 13:46:05 adam Exp $ + * $Id: charmap.c,v 1.23 2002-07-25 13:06:44 adam Exp $ * */ @@ -16,9 +16,23 @@ #include #include +#if HAVE_ICONV_H +#include +#else +typedef int iconv_t; +static size_t iconv(iconv_t t, char **buf, size_t *inbytesleft, + char **outbuf, size_t *outbytesleft) +{ + return -1; +} +#endif + +typedef unsigned ucs4_t; + #include #include + #define CHR_MAXSTR 1024 #define CHR_MAXEQUIV 32 @@ -190,6 +204,51 @@ unsigned char zebra_prim(char **s) yaz_log (LOG_DEBUG, "prim %.3s", *s); if (**s == '\\') { + (*s)++; + c = **s; + switch (c) + { + case '\\': c = '\\'; (*s)++; break; + case 'r': c = '\r'; (*s)++; break; + case 'n': c = '\n'; (*s)++; break; + case 't': c = '\t'; (*s)++; break; + case 's': c = ' '; (*s)++; break; + case 'x': sscanf(*s, "x%2x", &i); c = i; *s += 3; break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + sscanf(*s, "%3o", &i); + c = i; + *s += 3; + break; + default: + (*s)++; + } + } + else + { + c = **s; + ++(*s); + } + return c; +} + +ucs4_t zebra_prim_w(ucs4_t **s) +{ + ucs4_t c; + ucs4_t i = 0; 
+ char fmtstr[8]; + + yaz_log (LOG_DEBUG, "prim %.3s", (char *) *s); + if (**s == '\\') + { (*s)++; c = **s; switch (c) @@ -199,7 +258,14 @@ unsigned char zebra_prim(char **s) case 'n': c = '\n'; (*s)++; break; case 't': c = '\t'; (*s)++; break; case 's': c = ' '; (*s)++; break; - case 'x': sscanf(*s, "x%2x", &i); c = i; *s += 3; break; + case 'x': + fmtstr[0] = (*s)[0]; + fmtstr[1] = (*s)[1]; + fmtstr[2] = (*s)[2]; + fmtstr[3] = 0; + sscanf(fmtstr, "x%2x", &i); + c = i; + *s += 3; break; case '0': case '1': case '2': @@ -210,7 +276,11 @@ unsigned char zebra_prim(char **s) case '7': case '8': case '9': - sscanf(*s, "%3o", &i); + fmtstr[0] = (*s)[0]; + fmtstr[1] = (*s)[1]; + fmtstr[2] = (*s)[2]; + fmtstr[3] = 0; + sscanf(fmtstr, "%3o", &i); c = i; *s += 3; break; @@ -298,27 +368,84 @@ static void fun_add_qmap(const char *s, void *data, int num) logf (LOG_DEBUG, " %3d", (unsigned char) *s); } +static int scan_to_utf8 (iconv_t t, ucs4_t *from, size_t inlen, + char *outbuf, size_t outbytesleft) +{ + size_t inbytesleft = inlen * sizeof(ucs4_t); + char *inbuf = (char*) from; + size_t ret; + + if (t == (iconv_t)(-1)) + *outbuf++ = *from; /* ISO-8859-1 is OK here */ + else + { + size_t i; + for (i = 0; i 33 && arg[j] < 127) ? arg[j] : '?'); + if (s[0] == 0xfeff || s[0] == 0xfeff) /* skip byte Order Mark */ + s++; while (*s) { switch (*s) { case '{': s++; - begin = zebra_prim(&s); + begin = zebra_prim_w(&s); if (*s != '-') { logf(LOG_FATAL, "Bad range in char-map"); return -1; } s++; - end = zebra_prim(&s); + end = zebra_prim_w(&s); if (end <= begin) { logf(LOG_FATAL, "Bad range in char-map"); @@ -327,31 +454,28 @@ static int scan_string(char *s, s++; for (c = begin; c <= end; c++) { - str[0] = c; str[1] = '\0'; - (*fun)((char *) str, data, num ? (*num)++ : 0); + if (scan_to_utf8 (t_utf8, &c, 1, str, sizeof(str)-1)) + return -1; + (*fun)(str, data, num ? 
(*num)++ : 0); } break; case '[': s++; abort(); break; case '(': - p = (unsigned char*) ++s; - /* Find the end-marker, ignoring escapes */ - do - { - if (!(p = (unsigned char*) strchr((char*) p, ')'))) - { - logf(LOG_FATAL, "Missing ')' in string"); - return -1; - } - } - while (*(p - 1) == '\\'); - *p = 0; - (*fun)(s, data, num ? (*num)++ : 0); - s = (char*) p + 1; + ++s; + s0 = s; + while (*s != ')' || s[-1] == '\\') + s++; + *s = 0; + if (scan_to_utf8 (t_utf8, s0, s - s0, str, sizeof(str)-1)) + return -1; + (*fun)(str, data, num ? (*num)++ : 0); + s++; break; default: - c = zebra_prim(&s); - str[0] = c; str[1] = '\0'; - (*fun)((char *) str, data, num ? (*num)++ : 0); + c = zebra_prim_w(&s); + if (scan_to_utf8 (t_utf8, &c, 1, str, sizeof(str)-1)) + return -1; + (*fun)(str, data, num ? (*num)++ : 0); } } return 0; @@ -367,7 +491,17 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only, int errors = 0; int argc, num = (int) *CHR_BASE, i; NMEM nmem; + iconv_t t_unicode = (iconv_t)(-1); + iconv_t t_utf8 = (iconv_t)(-1); + unsigned endian = 31; + const char *ucs4_native = "UCS-4"; + + if (*(char*) &endian == 31) /* little endian? 
*/ + ucs4_native = "UCS-4LE"; +#if HAVE_ICONV_H + t_utf8 = iconv_open ("UTF-8", ucs4_native); +#endif logf (LOG_DEBUG, "maptab %s open", name); if (!(f = yaz_fopen(tabpath, name, "r", tabroot))) { @@ -421,7 +555,8 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only, logf(LOG_FATAL, "Syntax error in charmap"); ++errors; } - if (scan_string(argv[1], fun_addentry, res, &num) < 0) + if (scan_string(argv[1], t_unicode, t_utf8, fun_addentry, + res, &num) < 0) { logf(LOG_FATAL, "Bad value-set specification"); ++errors; @@ -443,7 +578,8 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only, logf(LOG_FATAL, "Missing arg for uppercase directive"); ++errors; } - if (scan_string(argv[1], fun_addentry, res, &num) < 0) + if (scan_string(argv[1], t_unicode, t_utf8, fun_addentry, + res, &num) < 0) { logf(LOG_FATAL, "Bad value-set specification"); ++errors; @@ -456,7 +592,8 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only, logf(LOG_FATAL, "Syntax error in charmap"); ++errors; } - if (scan_string(argv[1], fun_addspace, res, 0) < 0) + if (scan_string(argv[1], t_unicode, t_utf8, + fun_addspace, res, 0) < 0) { logf(LOG_FATAL, "Bad space specification"); ++errors; @@ -473,12 +610,14 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only, } buf.map = res; buf.string[0] = '\0'; - if (scan_string(argv[2], fun_mkstring, &buf, 0) < 0) + if (scan_string(argv[2], t_unicode, t_utf8, + fun_mkstring, &buf, 0) < 0) { logf(LOG_FATAL, "Bad map target"); ++errors; } - if (scan_string(argv[1], fun_add_map, &buf, 0) < 0) + if (scan_string(argv[1], t_unicode, t_utf8, + fun_add_map, &buf, 0) < 0) { logf(LOG_FATAL, "Bad map source"); ++errors; @@ -495,17 +634,29 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only, } buf.map = res; buf.string[0] = '\0'; - if (scan_string(argv[2], fun_mkstring, &buf, 0) < 0) + if (scan_string(argv[2], t_unicode, t_utf8, + 
fun_mkstring, &buf, 0) < 0) { logf(LOG_FATAL, "Bad qmap target"); ++errors; } - if (scan_string(argv[1], fun_add_qmap, &buf, 0) < 0) + if (scan_string(argv[1], t_unicode, t_utf8, + fun_add_qmap, &buf, 0) < 0) { logf(LOG_FATAL, "Bad qmap source"); ++errors; } } + else if (!yaz_matchstr(argv[0], "encoding")) + { +#if HAVE_ICONV_H + if (t_unicode != (iconv_t)(-1)) + iconv_close (t_unicode); + t_unicode = iconv_open (ucs4_native, argv[1]); +#else + logf (LOG_WARN, "Encoding ignored. iconv not installed"); +#endif + } else { logf(LOG_WARN, "Syntax error at '%s' in %s", line, name); @@ -518,6 +669,12 @@ chrmaptab chrmaptab_create(const char *tabpath, const char *name, int map_only, res = 0; } logf (LOG_DEBUG, "maptab %s close %d errors", name, errors); +#if HAVE_ICONV_H + if (t_utf8 != (iconv_t)(-1)) + iconv_close(t_utf8); + if (t_unicode != (iconv_t)(-1)) + iconv_close(t_unicode); +#endif return res; } -- 1.7.10.4