---- 1.2.0 2002/MM/DD
+--- 1.3.0 2002/MM/DD
+
+Zebra uses UTF-8 internally:
+
+1) New setting "encoding" for zebra.cfg that specifies encoding for
+OCTET terms in queries and record encoding for most transfer syntaxes
+(except those that use International Strings, such as GRS-1).
+
+2) The encoding of International Strings is UTF-8 by default. It
+may be changed by character set negotiation. If character set
+negotiation is in effect and records are selected for conversion,
+they will be converted to the selected character set - thus overriding
+the encoding setting in zebra.cfg.
+
+3) New directive "encoding" in .abs-files. This specifies the
+external encoding for files indexed by Zebra. However, if the records
+themselves have an XML header that specifies an encoding, that
+encoding will be used instead.
XML filter (-t grs.xml).
Zebra TODO
-$Id: TODO,v 1.11 2002-05-03 13:50:24 adam Exp $
+$Id: TODO,v 1.12 2002-07-25 13:06:43 adam Exp $
Make test scripts.
-Add XML parser (EXPAT).
-
-Ensure that UTF-8 indexing works and that EXPAT using
-other character set is converted to it.
-
Handle free lists for isamb (possibly others).
Check that recordId: actually works.
* All rights reserved.
* Sebastian Hammer, Adam Dickmeiss
*
- * $Id: mfile.c,v 1.46 2002-07-16 09:52:20 heikki Exp $
+ * $Id: mfile.c,v 1.47 2002-07-25 13:06:43 adam Exp $
*/
dent->d_name);
return 0;
}
+#ifndef WIN32
fsync(fd);
+#endif
close(fd);
if (dirp->max_bytes >= 0)
dirp->avail_bytes -= part_f->bytes;
dnl Zebra, Index Data Aps, 1994-2002
-dnl $Id: configure.in,v 1.37 2002-07-03 10:04:34 adam Exp $
+dnl $Id: configure.in,v 1.38 2002-07-25 13:06:43 adam Exp $
dnl
AC_INIT(include/zebraver.h)
AC_MSG_CHECKING(for package)
AC_MSG_RESULT([Z'mbol])
PROGPREFIX=zmbol
AC_DEFINE(ZMBOL,1)
- AM_INIT_AUTOMAKE(zmbol,1.2.0)
+ AM_INIT_AUTOMAKE(zmbol,1.3.0)
else
AC_MSG_RESULT([Zebra])
PROGPREFIX=zebra
AC_DEFINE(ZMBOL,0)
- AM_INIT_AUTOMAKE(zebra,1.2.0)
+ AM_INIT_AUTOMAKE(zebra,1.3.0)
fi
AM_CONDITIONAL(ISZMBOL,test $PACKAGE = zmbol)
dnl ------ Substitutions
fi
dnl
dnl ------ times
-AC_CHECK_HEADERS(sys/times.h)
+AC_CHECK_HEADERS(sys/times.h iconv.h)
dnl ------ mkstemp
AC_CHECK_FUNCS(mkstemp)
dnl
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: recctrl.h,v $
- * Revision 1.36 2002-07-02 20:20:09 adam
+ * Revision 1.37 2002-07-25 13:06:43 adam
+ * Character set negotiation updates
+ *
+ * Revision 1.36 2002/07/02 20:20:09 adam
* idzebra:{filename,score,size,localnumber} tags for XML
*
* Revision 1.35 2002/04/13 18:16:42 adam
off_t (*tellf)(void *fh);
oid_value input_format; /* Preferred record syntax */
Z_RecordComposition *comp; /* formatting instructions */
+ char *encoding; /* preferred character encoding */
int localno; /* local id of record */
int score; /* score 0-1000 or -1 if none */
int recordSize; /* size of record in bytes */
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: rset.h,v $
- * Revision 1.17 2002-03-20 20:24:29 adam
+ * Revision 1.18 2002-07-25 13:06:43 adam
+ * Character set negotiation updates
+ *
+ * Revision 1.17 2002/03/20 20:24:29 adam
* Hits per term. Returned in SearchResult-1
*
* Revision 1.16 1999/02/02 14:50:38 adam
int nn;
char *flags;
int count;
+ int type;
};
typedef struct rset
int no_rset_terms;
} rset;
-RSET_TERM rset_term_create (const char *name, int length, const char *flags);
+RSET_TERM rset_term_create (const char *name, int length, const char *flags,
+ int type);
void rset_term_destroy (RSET_TERM t);
RSET_TERM rset_term_dup (RSET_TERM t);
* Copyright (C) 1994-2002, Index Data
* All rights reserved.
*
- * $Id: zebraver.h,v 1.20 2002-04-05 08:46:26 adam Exp $
+ * $Id: zebraver.h,v 1.21 2002-07-25 13:06:43 adam Exp $
*/
#ifndef ZEBRAVER
-#define ZEBRAVER "1.2.0"
+#define ZEBRAVER "1.3.0"
#endif
#ifndef ZEBRADATE
-#define ZEBRADATE "$Date: 2002-04-05 08:46:26 $"
+#define ZEBRADATE "$Date: 2002-07-25 13:06:43 $"
#endif
* Copyright (C) 1995-2002, Index Data
* All rights reserved.
* Sebastian Hammer, Adam Dickmeiss, Heikki Levanto
- * $Id: index.h,v 1.83 2002-04-26 08:44:47 adam Exp $
+ * $Id: index.h,v 1.84 2002-07-25 13:06:43 adam Exp $
*/
#ifndef INDEX_H
#include <sys/times.h>
#endif
+#if HAVE_ICONV_H
+#include <iconv.h>
+#endif
+
#include <dict.h>
#include <isams.h>
#include <isam.h>
int records_updated;
int records_deleted;
int records_processed;
-
+ char *record_encoding;
+#if HAVE_ICONV_H
+ iconv_t iconv_to_utf8;
+ iconv_t iconv_from_utf8;
+#endif
};
struct rank_control {
RSET rset_trunc (ZebraHandle zh, ISAMS_P *isam_p, int no,
const char *term, int length_term, const char *flags,
- int preserve_position);
+ int preserve_position, int term_type);
void resultSetAddTerm (ZebraHandle zh, ZebraSet s, int reg_type,
const char *db, int set,
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: retrieve.c,v $
- * Revision 1.18 2002-07-02 20:20:09 adam
+ * Revision 1.19 2002-07-25 13:06:43 adam
+ * Character set negotiation updates
+ *
+ * Revision 1.18 2002/07/02 20:20:09 adam
* idzebra:{filename,score,size,localnumber} tags for XML
*
* Revision 1.17 2002/05/03 13:49:04 adam
retrieveCtrl.odr = stream;
retrieveCtrl.input_format = retrieveCtrl.output_format = input_format;
retrieveCtrl.comp = comp;
+ retrieveCtrl.encoding = zh->record_encoding;
retrieveCtrl.diagnostic = 0;
retrieveCtrl.dh = zh->reg->dh;
retrieveCtrl.res = zh->res;
* All rights reserved.
* Sebastian Hammer, Adam Dickmeiss, Heikki Levanto
*
- * $Id: trunc.c,v 1.25 2002-07-12 18:12:22 heikki Exp $
+ * $Id: trunc.c,v 1.26 2002-07-25 13:06:43 adam Exp $
*
*/
#include <stdio.h>
static RSET rset_trunc_r (ZebraHandle zi, const char *term, int length,
const char *flags, ISAMS_P *isam_p, int from, int to,
- int merge_chunk, int preserve_position)
+ int merge_chunk, int preserve_position,
+ int term_type)
{
RSET result;
RSFD result_rsfd;
parms.cmp = key_compare_it;
parms.key_size = sizeof(struct it_key);
parms.temp_path = res_get (zi->res, "setTmpDir");
- parms.rset_term = rset_term_create (term, length, flags);
+ parms.rset_term = rset_term_create (term, length, flags, term_type);
result = rset_create (rset_kind_temp, &parms);
result_rsfd = rset_open (result, RSETF_WRITE);
if (i_add <= to - i)
rset[rscur] = rset_trunc_r (zi, term, length, flags,
isam_p, i, i+i_add,
- merge_chunk, preserve_position);
+ merge_chunk, preserve_position,
+ term_type);
else
rset[rscur] = rset_trunc_r (zi, term, length, flags,
isam_p, i, to,
- merge_chunk, preserve_position);
+ merge_chunk, preserve_position,
+ term_type);
rscur++;
}
ti = heap_init (rscur, sizeof(struct it_key), key_compare_it);
RSET rset_trunc (ZebraHandle zi, ISAMS_P *isam_p, int no,
const char *term, int length, const char *flags,
- int preserve_position)
+ int preserve_position, int term_type)
{
logf (LOG_DEBUG, "rset_trunc no=%d", no);
if (no < 1)
{
rset_null_parms parms;
- parms.rset_term = rset_term_create (term, length, flags);
+ parms.rset_term = rset_term_create (term, length, flags, term_type);
return rset_create (rset_kind_null, &parms);
}
if (zi->reg->isams)
parms.pos = *isam_p;
parms.is = zi->reg->isams;
- parms.rset_term = rset_term_create (term, length, flags);
+ parms.rset_term = rset_term_create (term, length, flags,
+ term_type);
return rset_create (rset_kind_isams, &parms);
}
qsort (isam_p, no, sizeof(*isam_p), isams_trunc_cmp);
parms.pos = *isam_p;
parms.is = zi->reg->isam;
- parms.rset_term = rset_term_create (term, length, flags);
+ parms.rset_term = rset_term_create (term, length, flags,
+ term_type);
return rset_create (rset_kind_isam, &parms);
}
qsort (isam_p, no, sizeof(*isam_p), isam_trunc_cmp);
parms.cmp = key_compare_it;
parms.pos = *isam_p;
parms.is = zi->reg->isamc;
- parms.rset_term = rset_term_create (term, length, flags);
+ parms.rset_term = rset_term_create (term, length, flags,
+ term_type);
return rset_create (rset_kind_isamc, &parms);
}
#if NEW_TRUNC
parms.isam_positions = isam_p;
parms.no_isam_positions = no;
parms.no_save_positions = 100000;
- parms.rset_term = rset_term_create (term, length, flags);
+ parms.rset_term = rset_term_create (term, length, flags,
+ term_type);
return rset_create (rset_kind_m_or, &parms);
}
#endif
abort();
/* parms.pos = *isam_p; */
parms.is = zi->reg->isamd;
- parms.rset_term = rset_term_create (term, length, flags);
+ parms.rset_term = rset_term_create (term, length, flags,
+ term_type);
return rset_create (rset_kind_isamd, &parms);
}
#if NEW_TRUNC_NOT_DONE_FOR_ISAM_D
parms.cmp = key_compare_it;
parms.pos = *isam_p;
parms.is = zi->reg->isamb;
- parms.rset_term = rset_term_create (term, length, flags);
+ parms.rset_term = rset_term_create (term, length, flags,
+ term_type);
return rset_create (rset_kind_isamb, &parms);
}
qsort (isam_p, no, sizeof(*isam_p), isamd_trunc_cmp);
return rset_create (rset_kind_null, NULL);
}
return rset_trunc_r (zi, term, length, flags, isam_p, 0, no, 100,
- preserve_position);
+ preserve_position, term_type);
}
* Copyright (C) 1995-2002, Index Data
* All rights reserved.
*
- * $Id: zebraapi.c,v 1.62 2002-07-15 11:50:45 adam Exp $
+ * $Id: zebraapi.c,v 1.63 2002-07-25 13:06:43 adam Exp $
*/
#include <assert.h>
ZebraHandle zebra_open (ZebraService zs)
{
ZebraHandle zh;
+ const char *default_encoding;
if (!zs)
return 0;
zh->shadow_enable = 1;
+    /* The "encoding" setting names the encoding used for records; */
+    /* ISO-8859-1 is used when the setting is absent. */
+    default_encoding = res_get_def(zs->global_res, "encoding", "ISO-8859-1");
+    zh->record_encoding = xstrdup (default_encoding);
+#if HAVE_ICONV_H
+    /* One converter per direction. A failed open leaves (iconv_t)(-1) */
+    /* in the handle and is only reported as a warning. */
+    zh->iconv_to_utf8 =
+        iconv_open ("UTF-8", default_encoding);
+    if (zh->iconv_to_utf8 == (iconv_t)(-1))
+        yaz_log (LOG_WARN, "iconv: %s to UTF-8 unsupported",
+                 default_encoding);
+    zh->iconv_from_utf8 =
+        iconv_open (default_encoding, "UTF-8");
+    /* check the handle that was just opened, not iconv_to_utf8 again */
+    if (zh->iconv_from_utf8 == (iconv_t)(-1))
+        yaz_log (LOG_WARN, "iconv: UTF-8 to %s unsupported",
+                 default_encoding);
+#endif
+
zebra_mutex_cond_lock (&zs->session_lock);
zh->next = zs->sessions;
zebra_register_close (zh->service, zh->reg);
zebra_close_res (zh);
+ xfree (zh->record_encoding);
+#if HAVE_ICONV_H
+ if (zh->iconv_to_utf8 != (iconv_t) (-1))
+ iconv_close (zh->iconv_to_utf8);
+ if (zh->iconv_from_utf8 != (iconv_t) (-1))
+ iconv_close (zh->iconv_from_utf8);
+#endif
+
xfree (zh->admin_databaseName);
zebra_mutex_cond_lock (&zs->session_lock);
zebra_lock_destroy (zh->lock_normal);
zh->shadow_enable = value;
}
+/*
+ * zebra_record_encoding: replace the record encoding name stored in zh
+ * with a private copy of encoding. Only the stored name changes; this
+ * function does not touch zh->iconv_to_utf8 or zh->iconv_from_utf8.
+ * Always returns 0.
+ */
+int zebra_record_encoding (ZebraHandle zh, const char *encoding)
+{
+    /* duplicate first: encoding may be zh->record_encoding itself, */
+    /* in which case freeing first would read released memory */
+    char *copy = xstrdup (encoding);
+
+    xfree (zh->record_encoding);
+    zh->record_encoding = copy;
+    return 0;
+}
* All rights reserved.
* Sebastian Hammer, Adam Dickmeiss
*
- * $Id: zebraapi.h,v 1.17 2002-07-11 13:03:01 heikki Exp $
+ * $Id: zebraapi.h,v 1.18 2002-07-25 13:06:43 adam Exp $
*/
#ifndef ZEBRAAPI_H
YAZ_EXPORT void zebra_result (ZebraHandle zh, int *code, char **addinfo);
-YAZ_EXPORT const char *zebra_resultSetTerms (ZebraHandle zh,
- const char *setname,
- int no, int *count, int *no_max);
+
+YAZ_EXPORT int zebra_resultSetTerms (ZebraHandle zh, const char *setname,
+ int no, int *count,
+ int *type, char *out, size_t *len);
YAZ_EXPORT void zebra_sort (ZebraHandle zh, ODR stream,
int num_input_setnames,
YAZ_EXPORT
void zebra_register_statistics (ZebraHandle zh, int dumpdict);
+YAZ_EXPORT
+int zebra_record_encoding (ZebraHandle zh, const char *encoding);
+
YAZ_END_CDECL
#endif
* All rights reserved.
* Sebastian Hammer, Adam Dickmeiss
*
- * $Id: zrpn.c,v 1.116 2002-07-03 10:05:19 adam Exp $
+ * $Id: zrpn.c,v 1.117 2002-07-25 13:06:43 adam Exp $
*/
#include <stdio.h>
#include <assert.h>
return *s0;
}
+#define REGEX_CHARS "[]()|.*+!"
+
/* term_100: handle term, where trunc=none (no operators at all) */
static int term_100 (ZebraMaps zebra_maps, int reg_type,
const char **src, char *dst, int space_split,
{ /* reload last space */
while (space_start < space_end)
{
- if (!isalnum (*space_start) && *space_start != '-')
+ if (strchr (REGEX_CHARS, *space_start))
dst[i++] = '\\';
dst_term[j++] = *space_start;
dst[i++] = *space_start++;
/* add non-space char */
while (s1 < s0)
{
- if (!isalnum (*s1) && *s1 != '-')
+ if (strchr(REGEX_CHARS, *s1))
dst[i++] = '\\';
dst_term[j++] = *s1;
dst[i++] = *s1++;
break;
while (s1 < s0)
{
- if (!isalnum (*s1))
+ if (strchr(REGEX_CHARS, *s1))
dst[i++] = '\\';
dst_term[j++] = *s1;
dst[i++] = *s1++;
break;
while (s1 < s0)
{
- if (!isalnum (*s1))
+ if (strchr(REGEX_CHARS, *s1))
dst[i++] = '\\';
dst_term[j++] = *s1;
dst[i++] = *s1++;
break;
while (s1 < s0)
{
- if (!isalnum (*s1))
+ if (strchr(REGEX_CHARS, *s1))
dst[i++] = '\\';
dst_term[j++] = *s1;
dst[i++] = *s1++;
break;
while (s1 < s0)
{
- if (!isalnum (*s1))
+ if (strchr(REGEX_CHARS, *s1))
dst[i++] = '\\';
dst_term[j++] = *s1;
dst[i++] = *s1++;
logf (LOG_DEBUG, "term: %s", term_dst);
return rset_trunc (zh, grep_info->isam_p_buf,
grep_info->isam_p_indx, term_dst,
- strlen(term_dst), rank_type, 1 /* preserve pos */);
+ strlen(term_dst), rank_type, 1 /* preserve pos */,
+ zapt->term->which);
}
return 1;
}
-static void trans_term (ZebraHandle zh, Z_AttributesPlusTerm *zapt,
- char *termz)
+
+/*
+ * trans_term: store the query term of zapt in termz as a NUL-terminated
+ * string of at most IT_MAX_WORD-1 bytes.
+ *
+ * General (octet) terms are converted with zh->iconv_to_utf8 when that
+ * handle is valid; otherwise they are copied and truncated as they are.
+ * characterString terms are copied and truncated without conversion.
+ *
+ * Returns 0 on success. Returns -1 with zh->errCode set when the
+ * conversion fails (125) or the term type is not handled (124).
+ */
+static int trans_term (ZebraHandle zh, Z_AttributesPlusTerm *zapt,
+                       char *termz)
{
size_t sizez;
Z_Term *term = zapt->term;
- sizez = term->u.general->len;
- if (sizez > IT_MAX_WORD-1)
- sizez = IT_MAX_WORD-1;
- memcpy (termz, term->u.general->buf, sizez);
- termz[sizez] = '\0';
+    switch (term->which)
+    {
+    case Z_Term_general:
+#if HAVE_ICONV_H
+        if (zh->iconv_to_utf8 != (iconv_t)(-1))
+        {
+            char *inbuf = term->u.general->buf;
+            size_t inleft = term->u.general->len;
+            char *outbuf = termz;
+            size_t outleft = IT_MAX_WORD-1; /* keep room for the NUL */
+            size_t ret;
+
+            yaz_log (LOG_DEBUG, "converting general term to UTF-8");
+            ret = iconv(zh->iconv_to_utf8, &inbuf, &inleft,
+                        &outbuf, &outleft);
+            if (ret == (size_t)(-1))
+            {
+                /* reset the converter before it is used again */
+                iconv(zh->iconv_to_utf8, 0, 0, 0, 0);
+                zh->errCode = 125;
+                return -1;
+            }
+            *outbuf = 0;
+            return 0;
+        }
+#endif
+        sizez = term->u.general->len;
+        if (sizez > IT_MAX_WORD-1)
+            sizez = IT_MAX_WORD-1;
+        memcpy (termz, term->u.general->buf, sizez);
+        termz[sizez] = '\0';
+        break;
+    case Z_Term_characterString:
+        sizez = strlen(term->u.characterString);
+        if (sizez > IT_MAX_WORD-1)
+            sizez = IT_MAX_WORD-1;
+        memcpy (termz, term->u.characterString, sizez);
+        termz[sizez] = '\0';
+        break;
+    default:
+        /* termz has not been filled in: the caller must not go on */
+        zh->errCode = 124;
+        return -1;
+    }
+    return 0;
}
static void trans_scan_term (ZebraHandle zh, Z_AttributesPlusTerm *zapt,
int length_prox_term = 0;
int min_nn = 10000000;
int term_index;
+ int term_type = Z_Term_characterString;
const char *flags = NULL;
rsfd = (RSFD *) xmalloc (sizeof(*rsfd)*rset_no);
if (min_nn > rset[i]->rset_terms[j]->nn)
min_nn = rset[i]->rset_terms[j]->nn;
flags = nflags;
+
+            /* only if all term types are of type characterString .. */
+            /* the resulting term is of that type */
+            /* term_type starts out as characterString, so a single term */
+            /* of any other type makes the result general and later */
+            /* terms cannot switch it back */
+            if (rset[i]->rset_terms[j]->type != Z_Term_characterString)
+                term_type = Z_Term_general;
}
}
for (i = 0; i<rset_no; i++)
rset_null_parms parms;
parms.rset_term = rset_term_create (prox_term, length_prox_term,
- flags);
+ flags, term_type);
parms.rset_term->nn = 0;
result = rset_create (rset_kind_null, &parms);
}
RSFD rsfd_result;
parms.rset_term = rset_term_create (prox_term, length_prox_term,
- flags);
+ flags, term_type);
parms.rset_term->nn = min_nn;
parms.cmp = key_compare_it;
parms.key_size = sizeof (struct it_key);
logf (LOG_LOG, "generic prox, dist = %d, relation = %d, ordered =%d, exclusion=%d",
distance, relation, ordered, exclusion);
parms.rset_term = rset_term_create (prox_term, length_prox_term,
- flags);
+ flags, term_type);
parms.rset_term->nn = min_nn;
parms.cmp = key_compare_it;
parms.key_size = sizeof (struct it_key);
rset_null_parms parms;
parms.rset_term = rset_term_create (prox_term, length_prox_term,
- flags);
+ flags, term_type);
parms.rset_term->nn = 0;
result = rset_create (rset_kind_null, &parms);
}
{
rset_null_parms parms;
- parms.rset_term = rset_term_create (termz, -1, rank_type);
+ parms.rset_term = rset_term_create (termz, -1, rank_type,
+ zapt->term->which);
return rset_create (rset_kind_null, &parms);
}
else if (rset_no == 1)
{
rset_null_parms parms;
- parms.rset_term = rset_term_create (termz, -1, rank_type);
+ parms.rset_term = rset_term_create (termz, -1, rank_type,
+ zapt->term->which);
return rset_create (rset_kind_null, &parms);
}
result = rset[0];
{
rset_null_parms parms;
- parms.rset_term = rset_term_create (termz, -1, rank_type);
+ parms.rset_term = rset_term_create (termz, -1, rank_type,
+ zapt->term->which);
return rset_create (rset_kind_null, &parms);
}
result = rset[0];
rset[rset_no] = rset_trunc (zh, grep_info.isam_p_buf,
grep_info.isam_p_indx, term_dst,
strlen(term_dst), rank_type,
- 0 /* preserve position */);
+ 0 /* preserve position */,
+ zapt->term->which);
assert (rset[rset_no]);
if (++rset_no >= (int) (sizeof(rset)/sizeof(*rset)))
break;
{
rset_null_parms parms;
- parms.rset_term = rset_term_create (term_dst, -1, rank_type);
+ parms.rset_term = rset_term_create (term_dst, -1, rank_type,
+ zapt->term->which);
return rset_create (rset_kind_null, &parms);
}
result = rset[0];
struct it_key key;
rset_temp_parms parms;
- parms.rset_term = rset_term_create (termz, -1, rank_type);
+ parms.rset_term = rset_term_create (termz, -1, rank_type,
+ zapt->term->which);
parms.cmp = key_compare_it;
parms.key_size = sizeof (struct it_key);
parms.temp_path = res_get (zh->res, "setTmpDir");
sort_sequence->specs[i] = sks;
- parms.rset_term = rset_term_create (termz, -1, rank_type);
+ parms.rset_term = rset_term_create (termz, -1, rank_type,
+ zapt->term->which);
return rset_create (rset_kind_null, &parms);
}
rset_start_tag =
rset_trunc (zh, grep_info.isam_p_buf,
grep_info.isam_p_indx, use_string, strlen(use_string),
- rank_type, 1);
+ rank_type, 1, zapt->term->which);
prefix_len = 0;
ord = zebraExplain_lookupSU (zh->reg->zei, curAttributeSet, 2);
rset_end_tag =
rset_trunc (zh, grep_info.isam_p_buf,
grep_info.isam_p_indx, use_string, strlen(use_string),
- rank_type, 1);
+ rank_type, 1, zapt->term->which);
parms.key_size = sizeof(struct it_key);
parms.cmp = key_compare_it;
logf (LOG_DEBUG, "search_type=%s", search_type);
logf (LOG_DEBUG, "rank_type=%s", rank_type);
- if (zapt->term->which != Z_Term_general)
- {
- zh->errCode = 124;
- return NULL;
- }
- trans_term (zh, zapt, termz);
+ if (trans_term (zh, zapt, termz))
+ return 0;
if (sort_flag)
return rpn_sort_spec (zh, zapt, attributeSet, stream, sort_sequence,
&glist[i+before].term, mterm);
rset = rset_trunc (zh, &scan_info_array[j0].list[ptr[j0]].isam_p, 1,
glist[i+before].term, strlen(glist[i+before].term),
- NULL, 0);
+ NULL, 0, zapt->term->which);
ptr[j0]++;
for (j = j0+1; j<ord_no; j++)
rset2 =
rset_trunc (zh, &scan_info_array[j].list[ptr[j]].isam_p, 1,
glist[i+before].term,
- strlen(glist[i+before].term), NULL, 0);
+ strlen(glist[i+before].term), NULL, 0,
+ zapt->term->which);
bool_parms.key_size = sizeof(struct it_key);
bool_parms.cmp = key_compare_it;
rset = rset_trunc
(zh, &scan_info_array[j0].list[before-1-ptr[j0]].isam_p, 1,
glist[before-1-i].term, strlen(glist[before-1-i].term),
- NULL, 0);
+ NULL, 0, zapt->term->which);
ptr[j0]++;
rset2 = rset_trunc (zh,
&scan_info_array[j].list[before-1-ptr[j]].isam_p, 1,
glist[before-1-i].term,
- strlen(glist[before-1-i].term), NULL, 0);
+ strlen(glist[before-1-i].term), NULL, 0,
+ zapt->term->which);
bool_parms.key_size = sizeof(struct it_key);
bool_parms.cmp = key_compare_it;
* Copyright (C) 1995-2002, Index Data
* All rights reserved.
*
- * $Id: zserver.c,v 1.88 2002-05-07 11:05:19 adam Exp $
+ * $Id: zserver.c,v 1.89 2002-07-25 13:06:43 adam Exp $
*/
#include <stdio.h>
return r;
}
r->handle = zh;
+    if (q->charneg_request) /* character set and language negotiation? */
+    {
+        char **charsets = 0;
+        int num_charsets = 0; /* bounds the loop below; start empty */
+        char **langs = 0;
+        int num_langs = 0;
+        int selected = 0;
+        int i;
+
+        /* scratch memory for the proposal lists, released below */
+        NMEM nmem = nmem_create ();
+        yaz_log (LOG_LOG, "character set and language negotiation");
+
+        yaz_get_proposal_charneg (nmem, q->charneg_request,
+                                  &charsets, &num_charsets,
+                                  &langs, &num_langs, &selected);
+        /* take the first proposed character set that the decoder */
+        /* accepts and stop looking */
+        for (i = 0; i < num_charsets; i++)
+        {
+            yaz_log (LOG_LOG, "charset %d %s", i, charsets[i]);
+
+            if (odr_set_charset (q->decode, "UTF-8", charsets[i]) == 0)
+            {
+                odr_set_charset (q->stream, charsets[i], "UTF-8");
+                /* records follow the negotiated set only if the */
+                /* proposal selected them for conversion */
+                if (selected)
+                    zebra_record_encoding (zh, charsets[i]);
+                q->charneg_response =
+                    yaz_set_response_charneg (q->stream, charsets[i],
+                                              0, selected);
+                break;
+            }
+        }
+        nmem_destroy (nmem);
+    }
return r;
}
int count;
int no_terms;
int i;
+ int type;
struct Z_External *ext;
Z_SearchInfoReport *sr;
/* get no of terms for result set */
- zebra_resultSetTerms (zh, r->setname, -1, &count, &no_terms);
+ no_terms = zebra_resultSetTerms (zh, r->setname, 0, 0, 0, 0, 0);
if (!no_terms)
return;
for (i = 0; i<no_terms; i++)
{
Z_Term *term;
- const char *termz = zebra_resultSetTerms (zh, r->setname, i,
- &count, &no_terms);
+ char outbuf[1024];
+ size_t len = sizeof(outbuf);
+ zebra_resultSetTerms (zh, r->setname, i,
+ &count, &type, outbuf, &len);
sr->elements[i] = odr_malloc (r->stream, sizeof(**sr->elements));
sr->elements[i]->subqueryId = 0;
odr_malloc (r->stream, sizeof(Z_QueryExpressionTerm));
term = odr_malloc (r->stream, sizeof(Z_Term));
sr->elements[i]->subqueryExpression->u.term->queryTerm = term;
-
- term->which = Z_Term_general;
- term->u.general = odr_malloc (r->stream, sizeof(Odr_oct));
- term->u.general->buf = odr_strdup (r->stream, termz);
-
- term->u.general->len = strlen (termz);
- term->u.general->size = strlen (termz);
-
+        /* term->which must name the union member that is filled in */
+        switch (type)
+        {
+        case Z_Term_characterString:
+            yaz_log (LOG_LOG, "term as characterString");
+            term->which = Z_Term_characterString;
+            term->u.characterString = odr_strdup (r->stream, outbuf);
+            break;
+        case Z_Term_general:
+            yaz_log (LOG_LOG, "term as general");
+            term->which = Z_Term_general;
+            term->u.general = odr_malloc (r->stream, sizeof(*term->u.general));
+            term->u.general->size = term->u.general->len = len;
+            term->u.general->buf = odr_malloc (r->stream, len);
+            memcpy (term->u.general->buf, outbuf, len);
+            break;
+        default:
+            /* unknown type: return a NULL term; tagging the null */
+            /* value as general would make it be read as an Odr_oct */
+            term->which = Z_Term_null;
+            term->u.null = odr_nullval();
+        }
sr->elements[i]->subqueryExpression->u.term->termComment = 0;
sr->elements[i]->subqueryInterpretation = 0;
sr->elements[i]->subqueryRecommendation = 0;
* All rights reserved.
* Sebastian Hammer, Adam Dickmeiss
*
- * $Id: zserver.h,v 1.55 2002-04-04 14:14:13 adam Exp $
+ * $Id: zserver.h,v 1.56 2002-07-25 13:06:43 adam Exp $
*/
#include <yaz/backend.h>
+#include <yaz/charneg.h>
#include "zebraapi.h"
YAZ_BEGIN_CDECL
* All rights reserved.
* Sebastian Hammer, Adam Dickmeiss
*
- * $Id: zsets.c,v 1.36 2002-04-18 20:22:09 adam Exp $
+ * $Id: zsets.c,v 1.37 2002-07-25 13:06:43 adam Exp $
*/
#include <stdio.h>
#include <assert.h>
}
-const char *zebra_resultSetTerms (ZebraHandle zh, const char *setname,
- int no, int *count, int *no_max)
+/*
+ * zebra_resultSetTerms: report on term number no of result set setname.
+ *
+ * count, type and out are optional (may be 0). When given, *count
+ * receives the term's count and *type its type. When out is given,
+ * *len must hold the size of out on entry; the term name is stored in
+ * out, converted with zh->iconv_from_utf8 when that handle is valid,
+ * and NUL-terminated. On return *len is the number of bytes stored,
+ * not counting the NUL (0 if the conversion failed).
+ *
+ * Returns the number of terms in the set, or 0 when the set does not
+ * exist or no is out of range.
+ */
+int zebra_resultSetTerms (ZebraHandle zh, const char *setname,
+                          int no, int *count,
+                          int *type, char *out, size_t *len)
{
ZebraSet s = resultSetGet (zh, setname);
+    int no_max = 0;
- *count = 0;
- *no_max = 0;
+    if (count)
+        *count = 0;
if (!s || !s->rset)
return 0;
- *no_max = s->rset->no_rset_terms;
- if (no < 0 || no >= *no_max)
+    no_max = s->rset->no_rset_terms;
+    if (no < 0 || no >= no_max)
return 0;
- *count = s->rset->rset_terms[no]->count;
- return s->rset->rset_terms[no]->name;
+    if (count)
+        *count = s->rset->rset_terms[no]->count;
+    if (type)
+        *type = s->rset->rset_terms[no]->type;
+
+    /* a zero-sized buffer cannot even hold the terminator, and */
+    /* *len - 1 would wrap around to a huge value */
+    if (out && len && *len > 0)
+    {
+        char *inbuf = s->rset->rset_terms[no]->name;
+        size_t inleft = strlen(inbuf);
+        size_t outleft = *len - 1; /* keep room for the NUL */
+        int converted = 0;
+#if HAVE_ICONV_H
+        if (zh->iconv_from_utf8 != (iconv_t)(-1))
+        {
+            char *outbuf = out;
+            size_t ret;
+
+            ret = iconv(zh->iconv_from_utf8, &inbuf, &inleft,
+                        &outbuf, &outleft);
+            if (ret == (size_t)(-1))
+                *len = 0;
+            else
+                *len = outbuf - out;
+            converted = 1;
+        }
+#endif
+        if (!converted)
+        {
+            /* no converter: copy as is, truncated to fit */
+            if (inleft > outleft)
+                inleft = outleft;
+            *len = inleft;
+            memcpy (out, inbuf, *len);
+        }
+        out[*len] = 0;
+    }
+    return no_max;
}
* Copyright (C) 1994-2002, Index Data
* All rights reserved.
*
- * $Id: recgrs.c,v 1.54 2002-07-05 16:07:02 adam Exp $
+ * $Id: recgrs.c,v 1.55 2002-07-25 13:06:44 adam Exp $
*/
#include <stdio.h>
if ((oid_ent_to_oid (&oe, oidtmp)))
(*p->schemaAdd)(p, oidtmp);
}
+
+ /* ensure our data1 tree is UTF-8 */
+ data1_iconv (p->dh, mem, n, "UTF-8", data1_get_encoding(p->dh, n));
+
#if 0
data1_pr_tree (p->dh, n, stdout);
#endif
- data1_iconv (p->dh, mem, n, "ISO-8859-1", "UTF-8");
(*p->init)(p, &wrd);
if (dumpkeys(n, p, 0, &wrd) < 0)
nmem_destroy (mem);
return 0;
}
+ /* ensure our data1 tree is UTF-8 */
+ data1_iconv (p->dh, mem, node, "UTF-8", data1_get_encoding(p->dh, node));
+
#if 0
data1_pr_tree (p->dh, node, stdout);
#endif
else if (p->comp && !res)
selected = 1;
-#if 0
- data1_pr_tree (p->dh, node, stdout);
-#endif
#if 1
- data1_iconv (p->dh, mem, node, "ISO-8859-1", "UTF-8");
+ data1_pr_tree (p->dh, node, stdout);
#endif
logf (LOG_DEBUG, "grs_retrieve: transfer syntax mapping");
switch (p->output_format = (p->input_format != VAL_NONE ?
p->input_format : VAL_SUTRS))
{
-
case VAL_TEXT_XML:
add_idzebra_info (p, top, mem);
+ if (p->encoding)
+ data1_iconv (p->dh, mem, node, p->encoding, "UTF-8");
+
if (!(p->rec_buf = data1_nodetoidsgml(p->dh, node, selected,
&p->rec_len)))
p->diagnostic = 238;
p->rec_len = (size_t) (-1);
break;
case VAL_SUTRS:
+ if (p->encoding)
+ data1_iconv (p->dh, mem, node, p->encoding, "UTF-8");
if (!(p->rec_buf = data1_nodetobuf(p->dh, node, selected,
&p->rec_len)))
p->diagnostic = 238;
p->diagnostic = 238;
break;
}
+ if (p->encoding)
+ data1_iconv (p->dh, mem, node, p->encoding, "UTF-8");
if (!(p->rec_buf = data1_nodetomarc(p->dh, marctab, node,
selected, &p->rec_len)))
p->diagnostic = 238;
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: rset.c,v $
- * Revision 1.15 2002-03-20 20:24:30 adam
+ * Revision 1.16 2002-07-25 13:06:44 adam
+ * Character set negotiation updates
+ *
+ * Revision 1.15 2002/03/20 20:24:30 adam
* Hits per term. Returned in SearchResult-1
*
* Revision 1.14 1999/05/26 07:49:14 adam
return rs->rset_terms;
}
-RSET_TERM rset_term_create (const char *name, int length, const char *flags)
+RSET_TERM rset_term_create (const char *name, int length, const char *flags,
+ int type)
{
RSET_TERM t = (RSET_TERM) xmalloc (sizeof(*t));
if (!name)
t->flags = xstrdup (flags);
t->nn = -1;
t->count = 0;
+ t->type = type;
return t;
}
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: rsnull.c,v $
- * Revision 1.13 2002-03-21 10:25:42 adam
+ * Revision 1.14 2002-07-25 13:06:44 adam
+ * Character set negotiation updates
+ *
+ * Revision 1.13 2002/03/21 10:25:42 adam
* use lockDir. Fixes for searchResult for null/sort sets
*
* Revision 1.12 1999/05/26 07:49:14 adam
if (parms && null_parms->rset_term)
ct->rset_terms[0] = null_parms->rset_term;
else
- ct->rset_terms[0] = rset_term_create ("term", -1, "rank-0");
+ ct->rset_terms[0] = rset_term_create ("term", -1, "rank-0",
+ 0);
ct->rset_terms[0]->nn = 0;
return NULL;
set output "times-b.ps"
set terminal postscript
-set title "ISAM-b Mon Jul 15 13:16:34 CEST 2002"
+set title "ISAM-b Mon Jul 15 14:06:44 CEST 2002"
set xlabel "runs"
set ylabel "seconds"
plot [0:] [0:] 'times-b.log' using 2 title 'real' with linespoints, 'times-b.log' using 3 title 'user' with linespoints, 'times-b.log' using 4 title 'sys' with linespoints
notimestamps: 1
-isam: b
+isam: null
register: reg-b:2G
# Simple Zebra configuration file
-# $Id: zebra.cfg,v 1.16 2002-05-07 11:04:37 adam Exp $
+# $Id: zebra.cfg,v 1.17 2002-07-25 13:06:44 adam Exp $
#
# Where the schema files, attribute files, etc are located.
profilePath: .:../../tab:../../../yaz/tab
#storekeys: 1
#storedata: 1
#recordId: (bib1,identifier-standard)
-isam: b
+isam: c
* All rights reserved.
* Sebastian Hammer, Adam Dickmeiss
*
- * $Id: charmap.c,v 1.22 2002-05-03 13:46:05 adam Exp $
+ * $Id: charmap.c,v 1.23 2002-07-25 13:06:44 adam Exp $
*
*/
#include <string.h>
#include <assert.h>
+#if HAVE_ICONV_H
+#include <iconv.h>
+#else
+/* Built without iconv.h: supply a placeholder iconv_t and an iconv() */
+/* that always fails, so the code below compiles unchanged. */
+typedef int iconv_t;
+static size_t iconv(iconv_t t, char **buf, size_t *inbytesleft,
+ char **outbuf, size_t *outbytesleft)
+{
+ /* converts to (size_t)(-1), the value callers test for failure */
+ return -1;
+}
+#endif
+
+/* element type of the UCS-4 buffers used by the char-map scanner */
+typedef unsigned ucs4_t;
+
#include <yaz/yaz-util.h>
#include <charmap.h>
+
#define CHR_MAXSTR 1024
#define CHR_MAXEQUIV 32
yaz_log (LOG_DEBUG, "prim %.3s", *s);
if (**s == '\\')
{
+ (*s)++;
+ c = **s;
+ switch (c)
+ {
+ case '\\': c = '\\'; (*s)++; break;
+ case 'r': c = '\r'; (*s)++; break;
+ case 'n': c = '\n'; (*s)++; break;
+ case 't': c = '\t'; (*s)++; break;
+ case 's': c = ' '; (*s)++; break;
+ case 'x': sscanf(*s, "x%2x", &i); c = i; *s += 3; break;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ sscanf(*s, "%3o", &i);
+ c = i;
+ *s += 3;
+ break;
+ default:
+ (*s)++;
+ }
+ }
+ else
+ {
+ c = **s;
+ ++(*s);
+ }
+ return c;
+}
+
+ucs4_t zebra_prim_w(ucs4_t **s)
+{
+ ucs4_t c;
+ ucs4_t i = 0;
+ char fmtstr[8];
+
+ yaz_log (LOG_DEBUG, "prim %.3s", (char *) *s);
+ if (**s == '\\')
+ {
(*s)++;
c = **s;
switch (c)
case 'n': c = '\n'; (*s)++; break;
case 't': c = '\t'; (*s)++; break;
case 's': c = ' '; (*s)++; break;
- case 'x': sscanf(*s, "x%2x", &i); c = i; *s += 3; break;
+ case 'x':
+ fmtstr[0] = (*s)[0];
+ fmtstr[1] = (*s)[1];
+ fmtstr[2] = (*s)[2];
+ fmtstr[3] = 0;
+ sscanf(fmtstr, "x%2x", &i);
+ c = i;
+ *s += 3; break;
case '0':
case '1':
case '2':
case '7':
case '8':
case '9':
- sscanf(*s, "%3o", &i);
+ fmtstr[0] = (*s)[0];
+ fmtstr[1] = (*s)[1];
+ fmtstr[2] = (*s)[2];
+ fmtstr[3] = 0;
+ sscanf(fmtstr, "%3o", &i);
c = i;
*s += 3;
break;
logf (LOG_DEBUG, " %3d", (unsigned char) *s);
}
+/*
+ * scan_to_utf8: turn the inlen values at from into a NUL-terminated
+ * string in outbuf. outbytesleft is the room in outbuf not counting
+ * the terminator.
+ *
+ * With a valid handle t the values are converted by iconv. Without one
+ * each value is stored as a single byte (ISO-8859-1 fallback).
+ *
+ * Returns 0 on success, -1 if the conversion fails or the result does
+ * not fit.
+ */
+static int scan_to_utf8 (iconv_t t, ucs4_t *from, size_t inlen,
+                         char *outbuf, size_t outbytesleft)
+{
+    size_t i;
+
+    if (t == (iconv_t)(-1))
+    {
+        /* ISO-8859-1 is OK here: one output byte per input value. */
+        /* Copy all inlen values, not just the first one. */
+        if (inlen > outbytesleft)
+        {
+            yaz_log (LOG_WARN, "char-map string too long");
+            return -1;
+        }
+        for (i = 0; i<inlen; i++)
+            *outbuf++ = (char) from[i];
+    }
+    else
+    {
+        char *inbuf = (char*) from;
+        size_t inbytesleft = inlen * sizeof(ucs4_t);
+        size_t ret;
+
+        for (i = 0; i<inlen; i++)
+            yaz_log (LOG_LOG, "%08X", from[i]);
+        ret = iconv (t, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+        if (ret == (size_t) (-1))
+        {
+            yaz_log (LOG_WARN|LOG_ERRNO, "bad unicode sequence");
+            for (i = 0; i<inlen; i++)
+                yaz_log (LOG_LOG, "%08X", from[i]);
+            return -1;
+        }
+    }
+    *outbuf = '\0';
+    return 0;
+}
-static int scan_string(char *s,
+static int scan_string(char *s_native,
+ iconv_t t_unicode, iconv_t t_utf8,
void (*fun)(const char *c, void *data, int num),
void *data, int *num)
{
- unsigned char c, str[1024], begin, end, *p;
-
+ char str[1024];
+
+ ucs4_t arg[512];
+ ucs4_t *s0, *s = arg;
+ ucs4_t c, begin, end;
+ size_t i, j;
+
+    /* Fill arg with the UCS-4 form of s_native; i ends up as the */
+    /* number of values stored. */
+    if (t_unicode != (iconv_t)(-1))
+    {
+        char *outbuf = (char *) arg;
+        char *inbuf = s_native;
+        size_t outbytesleft = sizeof(arg)-4; /* room for terminator */
+        size_t inbytesleft = strlen(s_native);
+        size_t ret;
+        ret = iconv(t_unicode, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+        if (ret == (size_t)(-1))
+            return -1;
+        i = (outbuf - (char*) arg)/sizeof(ucs4_t);
+        yaz_log (LOG_LOG, "to unicode");
+    }
+    else
+    {
+        /* arg must hold every character plus the terminator below */
+        if (strlen(s_native) >= sizeof(arg)/sizeof(*arg))
+        {
+            yaz_log (LOG_WARN, "char-map string too long");
+            return -1;
+        }
+        for (i = 0; s_native[i]; i++)
+            arg[i] = s_native[i] & 255; /* ISO-8859-1 conversion */
+        yaz_log (LOG_LOG, "to virtual unicode");
+    }
+    arg[i] = 0; /* terminate */
+    /* casts make the arguments match the %d and %c conversions */
+    for (j = 0; j<i; j++)
+        yaz_log (LOG_LOG, " %d %8X %d %c", (int) j, arg[j], (int) arg[j],
+                 (arg[j] > 33 && arg[j] < 127) ? (int) arg[j] : '?');
+    if (s[0] == 0xfeff) /* skip byte Order Mark */
+        s++;
while (*s)
{
switch (*s)
{
case '{':
s++;
- begin = zebra_prim(&s);
+ begin = zebra_prim_w(&s);
if (*s != '-')
{
logf(LOG_FATAL, "Bad range in char-map");
return -1;
}
s++;
- end = zebra_prim(&s);
+ end = zebra_prim_w(&s);
if (end <= begin)
{
logf(LOG_FATAL, "Bad range in char-map");
s++;
for (c = begin; c <= end; c++)
{
- str[0] = c; str[1] = '\0';
- (*fun)((char *) str, data, num ? (*num)++ : 0);
+ if (scan_to_utf8 (t_utf8, &c, 1, str, sizeof(str)-1))
+ return -1;
+ (*fun)(str, data, num ? (*num)++ : 0);
}
break;
case '[': s++; abort(); break;
case '(':
- p = (unsigned char*) ++s;
- /* Find the end-marker, ignoring escapes */
- do
- {
- if (!(p = (unsigned char*) strchr((char*) p, ')')))
- {
- logf(LOG_FATAL, "Missing ')' in string");
- return -1;
- }
- }
- while (*(p - 1) == '\\');
- *p = 0;
- (*fun)(s, data, num ? (*num)++ : 0);
- s = (char*) p + 1;
+            ++s;
+            s0 = s;
+            /* find the closing ')' that is not preceded by a */
+            /* backslash, stopping at the end of the string */
+            while (*s && (*s != ')' || s[-1] == '\\'))
+                s++;
+            if (!*s)
+            {
+                logf(LOG_FATAL, "Missing ')' in string");
+                return -1;
+            }
+            *s = 0;
+            if (scan_to_utf8 (t_utf8, s0, s - s0, str, sizeof(str)-1))
+                return -1;
+            (*fun)(str, data, num ? (*num)++ : 0);
+            s++;
break;
default:
- c = zebra_prim(&s);
- str[0] = c; str[1] = '\0';
- (*fun)((char *) str, data, num ? (*num)++ : 0);
+ c = zebra_prim_w(&s);
+ if (scan_to_utf8 (t_utf8, &c, 1, str, sizeof(str)-1))
+ return -1;
+ (*fun)(str, data, num ? (*num)++ : 0);
}
}
return 0;
int errors = 0;
int argc, num = (int) *CHR_BASE, i;
NMEM nmem;
+ iconv_t t_unicode = (iconv_t)(-1);
+ iconv_t t_utf8 = (iconv_t)(-1);
+ unsigned endian = 31;
+ const char *ucs4_native = "UCS-4";
+
+ if (*(char*) &endian == 31) /* little endian? */
+ ucs4_native = "UCS-4LE";
+#if HAVE_ICONV_H
+ t_utf8 = iconv_open ("UTF-8", ucs4_native);
+#endif
logf (LOG_DEBUG, "maptab %s open", name);
if (!(f = yaz_fopen(tabpath, name, "r", tabroot)))
{
logf(LOG_FATAL, "Syntax error in charmap");
++errors;
}
- if (scan_string(argv[1], fun_addentry, res, &num) < 0)
+ if (scan_string(argv[1], t_unicode, t_utf8, fun_addentry,
+ res, &num) < 0)
{
logf(LOG_FATAL, "Bad value-set specification");
++errors;
logf(LOG_FATAL, "Missing arg for uppercase directive");
++errors;
}
- if (scan_string(argv[1], fun_addentry, res, &num) < 0)
+ if (scan_string(argv[1], t_unicode, t_utf8, fun_addentry,
+ res, &num) < 0)
{
logf(LOG_FATAL, "Bad value-set specification");
++errors;
logf(LOG_FATAL, "Syntax error in charmap");
++errors;
}
- if (scan_string(argv[1], fun_addspace, res, 0) < 0)
+ if (scan_string(argv[1], t_unicode, t_utf8,
+ fun_addspace, res, 0) < 0)
{
logf(LOG_FATAL, "Bad space specification");
++errors;
}
buf.map = res;
buf.string[0] = '\0';
- if (scan_string(argv[2], fun_mkstring, &buf, 0) < 0)
+ if (scan_string(argv[2], t_unicode, t_utf8,
+ fun_mkstring, &buf, 0) < 0)
{
logf(LOG_FATAL, "Bad map target");
++errors;
}
- if (scan_string(argv[1], fun_add_map, &buf, 0) < 0)
+ if (scan_string(argv[1], t_unicode, t_utf8,
+ fun_add_map, &buf, 0) < 0)
{
logf(LOG_FATAL, "Bad map source");
++errors;
}
buf.map = res;
buf.string[0] = '\0';
- if (scan_string(argv[2], fun_mkstring, &buf, 0) < 0)
+ if (scan_string(argv[2], t_unicode, t_utf8,
+ fun_mkstring, &buf, 0) < 0)
{
logf(LOG_FATAL, "Bad qmap target");
++errors;
}
- if (scan_string(argv[1], fun_add_qmap, &buf, 0) < 0)
+ if (scan_string(argv[1], t_unicode, t_utf8,
+ fun_add_qmap, &buf, 0) < 0)
{
logf(LOG_FATAL, "Bad qmap source");
++errors;
}
}
+        else if (!yaz_matchstr(argv[0], "encoding"))
+        {
+            /* "encoding <name>": strings in the rest of the file are */
+            /* converted from <name> before they are scanned */
+            if (argc != 2)
+            {
+                logf(LOG_FATAL, "Syntax error in charmap");
+                ++errors;
+            }
+            else
+            {
+#if HAVE_ICONV_H
+                if (t_unicode != (iconv_t)(-1))
+                    iconv_close (t_unicode);
+                t_unicode = iconv_open (ucs4_native, argv[1]);
+                /* on failure scan_string falls back to treating the */
+                /* input as ISO-8859-1, so say so */
+                if (t_unicode == (iconv_t)(-1))
+                    logf (LOG_WARN, "iconv: %s to %s unsupported",
+                          argv[1], ucs4_native);
+#else
+                logf (LOG_WARN, "Encoding ignored. iconv not installed");
+#endif
+            }
+        }
else
{
logf(LOG_WARN, "Syntax error at '%s' in %s", line, name);
res = 0;
}
logf (LOG_DEBUG, "maptab %s close %d errors", name, errors);
+#if HAVE_ICONV_H
+ if (t_utf8 != (iconv_t)(-1))
+ iconv_close(t_utf8);
+ if (t_unicode != (iconv_t)(-1))
+ iconv_close(t_unicode);
+#endif
return res;
}