+Implemented the 'equivalent' directive for .chr-files.
+
Added 'melm' directive to absyn format to simplify config files
for MARC-style databases. See tab/marc21.abs for an example.
-/* $Id: charmap.h,v 1.11 2005-01-15 19:38:24 adam Exp $
+/* $Id: charmap.h,v 1.12 2005-03-11 17:56:32 adam Exp $
Copyright (C) 1995-2005
Index Data ApS
YAZ_EXPORT const char **chr_map_input(chrmaptab t, const char **from, int len, int first);
YAZ_EXPORT const char **chr_map_input_x(chrmaptab t,
const char **from, int *len, int first);
-YAZ_EXPORT const char **chr_map_input_q(chrmaptab maptab,
- const char **from, int len,
- const char **qmap);
+YAZ_EXPORT const char **chr_map_q_input(chrmaptab maptab,
+ const char **from, int len, int first);
YAZ_EXPORT const char *chr_map_output(chrmaptab t, const char **from, int len);
-/* $Id: zebramap.h,v 1.3 2005-01-15 19:38:24 adam Exp $
+/* $Id: zebramap.h,v 1.4 2005-03-11 17:56:33 adam Exp $
Copyright (C) 1995-2005
Index Data ApS
const char **from, int len, int first);
YAZ_EXPORT
+const char **zebra_maps_search (ZebraMaps zms, unsigned reg_id,
+ const char **from, int len, int *q_map_match);
+
+YAZ_EXPORT
const char *zebra_maps_output(ZebraMaps, unsigned reg_id, const char **from);
YAZ_EXPORT
-/* $Id: zrpn.c,v 1.170 2005-03-05 09:19:15 adam Exp $
+/* $Id: zrpn.c,v 1.171 2005-03-11 17:56:34 adam Exp $
Copyright (C) 1995-2005
Index Data ApS
return *s0;
}
+
+/*
+ * esc_str: render in_size bytes of in_buf as a readable dump in out_buf.
+ * Every input byte becomes "XX:c " - two upper-case hex digits, a colon,
+ * the byte itself (or '?' when it is outside printable ASCII 32..126)
+ * and a space. As soon as the output grows longer than out_size - 20
+ * characters, ".." is appended and the remaining input is dropped.
+ * out_size must be greater than 20 (asserted).
+ */
+static void esc_str(char *out_buf, int out_size,
+                    const char *in_buf, int in_size)
+{
+    int k;
+    int pos = 0;  /* length written so far; saves re-scanning out_buf */
+
+    assert(out_buf);
+    assert(in_buf);
+    assert(out_size > 20);
+    *out_buf = '\0';
+    for (k = 0; k < in_size; k++)
+    {
+        int c = in_buf[k] & 0xff;
+        int pc = (c < 32 || c > 126) ? '?' : c;
+
+        /* sprintf returns the number of characters it stored (always 5) */
+        pos += sprintf(out_buf + pos, "%02X:%c ", c, pc);
+        if (pos > out_size - 20)
+        {
+            strcpy(out_buf + pos, "..");
+            break;
+        }
+    }
+}
+
#define REGEX_CHARS " []()|.*+?!"
/* term_100: handle term, where trunc = none(no operators at all) */
const char **src, char *dst, int space_split,
char *dst_term)
{
- const char *s0, *s1;
+ const char *s0;
const char **map;
int i = 0;
int j = 0;
s0 = *src;
while (*s0)
{
- s1 = s0;
- map = zebra_maps_input(zebra_maps, reg_type, &s0, strlen(s0), 0);
+ const char *s1 = s0;
+ int q_map_match = 0;
+ map = zebra_maps_search(zebra_maps, reg_type, &s0, strlen(s0),
+ &q_map_match);
if (space_split)
{
if (**map == *CHR_SPACE)
space_start = space_end = 0;
}
}
- /* add non-space char */
- while (s1 < s0)
- {
- if (strchr(REGEX_CHARS, *s1))
- dst[i++] = '\\';
- dst_term[j++] = *s1;
- dst[i++] = *s1++;
- }
+ /* add non-space char */
+ memcpy(dst_term+j, s1, s0 - s1);
+ j += (s0 - s1);
+ if (!q_map_match)
+ {
+ while (s1 < s0)
+ {
+ if (strchr(REGEX_CHARS, *s1))
+ dst[i++] = '\\';
+ dst[i++] = *s1++;
+ }
+ }
+ else
+ {
+ char tmpbuf[80];
+ esc_str(tmpbuf, sizeof(tmpbuf), map[0], strlen(map[0]));
+
+ strcpy(dst + i, map[0]);
+ i += strlen(map[0]);
+ }
}
dst[i] = '\0';
dst_term[j] = '\0';
const char **src, char *dst, int space_split,
char *dst_term)
{
- const char *s0, *s1;
+ const char *s0;
const char **map;
int i = 0;
int j = 0;
}
else
{
- s1 = s0;
- map = zebra_maps_input(zebra_maps, reg_type, &s0, strlen(s0), 0);
+ const char *s1 = s0;
+ int q_map_match = 0;
+ map = zebra_maps_search(zebra_maps, reg_type, &s0, strlen(s0),
+ &q_map_match);
if (space_split && **map == *CHR_SPACE)
break;
- while (s1 < s0)
- {
- if (strchr(REGEX_CHARS, *s1))
- dst[i++] = '\\';
- dst_term[j++] = *s1;
- dst[i++] = *s1++;
- }
+
+ /* add non-space char */
+ memcpy(dst_term+j, s1, s0 - s1);
+ j += (s0 - s1);
+ if (!q_map_match)
+ {
+ while (s1 < s0)
+ {
+ if (strchr(REGEX_CHARS, *s1))
+ dst[i++] = '\\';
+ dst[i++] = *s1++;
+ }
+ }
+ else
+ {
+ char tmpbuf[80];
+ esc_str(tmpbuf, sizeof(tmpbuf), map[0], strlen(map[0]));
+
+ strcpy(dst + i, map[0]);
+ i += strlen(map[0]);
+ }
}
}
dst[i] = '\0';
{
int i = 0;
int j = 0;
- const char *s0, *s1;
+ const char *s0;
const char **map;
if (!term_pre(zebra_maps, reg_type, src, "^\\()[].*+?|", "(", !space_split))
}
else
{
- s1 = s0;
- map = zebra_maps_input(zebra_maps, reg_type, &s0, strlen(s0), 0);
- if (**map == *CHR_SPACE)
+ const char *s1 = s0;
+ int q_map_match = 0;
+ map = zebra_maps_search(zebra_maps, reg_type, &s0, strlen(s0),
+ &q_map_match);
+ if (space_split && **map == *CHR_SPACE)
break;
- while (s1 < s0)
- {
- if (strchr(REGEX_CHARS, *s1))
- dst[i++] = '\\';
- dst_term[j++] = *s1;
- dst[i++] = *s1++;
- }
+
+ /* add non-space char */
+ memcpy(dst_term+j, s1, s0 - s1);
+ j += (s0 - s1);
+ if (!q_map_match)
+ {
+ while (s1 < s0)
+ {
+ if (strchr(REGEX_CHARS, *s1))
+ dst[i++] = '\\';
+ dst[i++] = *s1++;
+ }
+ }
+ else
+ {
+ char tmpbuf[80];
+ esc_str(tmpbuf, sizeof(tmpbuf), map[0], strlen(map[0]));
+
+ strcpy(dst + i, map[0]);
+ i += strlen(map[0]);
+ }
}
}
dst[i] = '\0';
dst_term[j] = '\0';
*src = s0;
+
return i;
}
}
if (attr_ok)
{
+ char buf[80];
+ const char *input = term_dict + prefix_len;
+ esc_str(buf, sizeof(buf), input, strlen(input));
+ }
+ if (attr_ok)
+ {
yaz_log(log_level_rpn, "dict_lookup_grep: %s", term_dict+prefix_len);
r = dict_lookup_grep(zh->reg->dict, term_dict, regex_range,
grep_info, &max_pos, init_pos,
grep_info.isam_p_indx = 0;
r = dict_lookup_grep(zh->reg->dict, term_dict, 0,
&grep_info, &max_pos, 0, grep_handle);
- yaz_log (YLOG_LOG, "%s %d positions", term,
+ yaz_log (YLOG_DEBUG, "%s %d positions", term,
grep_info.isam_p_indx);
rset = rset_trunc(zh, grep_info.isam_p_buf,
grep_info.isam_p_indx, term, strlen(term),
-/* $Id: charmap1.c,v 1.3 2005-01-15 19:38:35 adam Exp $
+/* $Id: charmap1.c,v 1.4 2005-03-11 17:56:36 adam Exp $
Copyright (C) 1995-2005
Index Data ApS
do_query(__LINE__, zh, "@term string ḥ", 1);
+ /* search for UNICODE A ring */
+ do_query(__LINE__, zh, "@term string lås", 1);
+
+ /* search for aa */
+ do_query(__LINE__, zh, "@term string laas", 1);
+
+ /* search for UNICODE A ring as regular expression (@attr 5=102) */
+ do_query(__LINE__, zh, "@attr 5=102 @term string lås", 1);
+
+ /* search for aaa */
+ do_query(__LINE__, zh, "@term string laaas", 0);
+
return close_down(zh, zs, 0);
}
-# $Id: string.utf8.chr,v 1.1 2004-03-09 15:12:15 adam Exp $
+# $Id: string.utf8.chr,v 1.2 2005-03-11 17:56:36 adam Exp $
# Define the basic value-set. *Beware* of changing this without re-indexing
# your databases.
# Characters to be considered equivalent for searching purposes.
-# equivalent æä(ae)
-# equivalent øö(oe)
-# equivalent å(aa)
-# equivalent uü
+equivalent æä(ae)
+equivalent øö(oe)
+equivalent å(aa)
+equivalent uü
# Supplemental mappings
map \L1E25 h
# Latin letter with H with dot below
map \L1E24 h
+
+
+
<gils>
<Title>
h æ
+ laas
<Acronym>
UUCCSEIS
</Acronym>
-/* $Id: charmap.c,v 1.35 2005-01-16 23:14:58 adam Exp $
+/* $Id: charmap.c,v 1.36 2005-03-11 17:56:36 adam Exp $
Copyright (C) 1995-2005
Index Data ApS
} chrwork;
/*
+ * Work data for the 'equivalent' directive: holds the strings that
+ * scan_string hands to fun_add_equivalent_string.
+ */
+typedef struct
+{
+ NMEM nmem;  /* memory handle that fun_add_equivalent_string passes to nmem_strdup */
+ int no_eq;  /* number of entries filled in eq[]; capped at CHR_MAXEQUIV */
+ char *eq[CHR_MAXEQUIV];  /* copies of the collected equivalent strings */
+} chr_equiv_work;
+/*
* Add an entry to the character map.
*/
static chr_t_entry *set_map_string(chr_t_entry *root, NMEM nmem,
return (const char **) (res->target);
}
+/*
+ * Look up the text at *from in the query map (q_input) of maptab.
+ * Returns the target of the entry located by find_entry_x, or 0 when
+ * find_entry_x reports no entry.
+ */
+const char **chr_map_q_input(chrmaptab maptab,
+                             const char **from, int len, int first)
+{
+    chr_t_entry *hit;
+    int lens[2];
+
+    /* find_entry_x takes the length as a two-element array */
+    lens[0] = len;
+    lens[1] = -1;
+
+    hit = find_entry_x(maptab->q_input, from, lens, first);
+    if (hit)
+        return (const char **) (hit->target);
+    return 0;
+}
+
+
const char *chr_map_output(chrmaptab maptab, const char **from, int len)
{
unsigned char c = ** (unsigned char **) from;
}
/*
+ * Append a copy of the string to the list of equivalents (scan_string
+ * handler). Strings beyond CHR_MAXEQUIV are ignored.
+ */
+static void fun_add_equivalent_string(const char *s, void *data, int num)
+{
+    chr_equiv_work *work = (chr_equiv_work *) data;
+
+    /* store a copy only while there is still room in the eq table */
+    if (work->no_eq != CHR_MAXEQUIV)
+    {
+        work->eq[work->no_eq] = nmem_strdup(work->nmem, s);
+        work->no_eq++;
+    }
+}
+
+/*
* Add a map to the string contained in the argument.
*/
static void fun_add_map(const char *s, void *data, int num)
yaz_log (YLOG_DEBUG, " %3d", (unsigned char) *s);
}
-/*
- * Add a query map to the string contained in the argument.
- */
-static void fun_add_qmap(const char *s, void *data, int num)
-{
- chrwork *arg = (chrwork *) data;
-
- assert(arg->map->q_input);
- yaz_log (YLOG_DEBUG, "set qmap %.*s", (int) strlen(s), s);
- set_map_string(arg->map->q_input, arg->map->nmem, s,
- strlen(s), arg->string, 0);
- for (s = arg->string; *s; s++)
- yaz_log (YLOG_DEBUG, " %3d", (unsigned char) *s);
-}
-
static int scan_to_utf8 (yaz_iconv_t t, ucs4_t *from, size_t inlen,
char *outbuf, size_t outbytesleft)
{
++errors;
}
}
- else if (!yaz_matchstr(argv[0], "qmap"))
+ else if (!yaz_matchstr(argv[0], "equivalent"))
{
- chrwork buf;
+ chr_equiv_work w;
- if (argc != 3)
+ if (argc != 2)
{
- yaz_log(YLOG_FATAL, "charmap directive qmap requires 2 args");
+ yaz_log(YLOG_FATAL, "equivalent requires 1 argument");
++errors;
}
- buf.map = res;
- buf.string[0] = '\0';
- if (scan_string(argv[2], t_unicode, t_utf8,
- fun_mkstring, &buf, 0) < 0)
+ w.nmem = res->nmem;
+ w.no_eq = 0;
+ if (scan_string(argv[1], t_unicode, t_utf8,
+ fun_add_equivalent_string, &w, 0) < 0)
{
- yaz_log(YLOG_FATAL, "Bad qmap target");
+ yaz_log(YLOG_FATAL, "equivalent: invalid string");
++errors;
}
- if (scan_string(argv[1], t_unicode, t_utf8,
- fun_add_qmap, &buf, 0) < 0)
+ else if (w.no_eq == 0)
{
- yaz_log(YLOG_FATAL, "Bad qmap source");
+ yaz_log(YLOG_FATAL, "equivalent: no strings");
++errors;
}
+ else
+ {
+ char *result_str;
+ int i, slen = 5;
+
+ /* determine length of regular expression */
+ for (i = 0; i<w.no_eq; i++)
+ slen += strlen(w.eq[i]) + 1;
+ result_str = nmem_malloc(res->nmem, slen + 5);
+
+ /* build the regular expression */
+ *result_str = '\0';
+ slen = 0;
+ for (i = 0; i<w.no_eq; i++)
+ {
+ result_str[slen++] = i ? '|' : '(';
+ strcpy(result_str + slen, w.eq[i]);
+ slen += strlen(w.eq[i]);
+ }
+ result_str[slen++] = ')';
+ result_str[slen] = '\0';
+
+ /* each eq will map to this regular expression */
+ for (i = 0; i<w.no_eq; i++)
+ {
+ set_map_string(res->q_input, res->nmem,
+ w.eq[i], strlen(w.eq[i]),
+ result_str, 0);
+ }
+ }
}
else if (!yaz_matchstr(argv[0], "encoding"))
{
-/* $Id: zebramap.c,v 1.39 2005-01-16 23:14:58 adam Exp $
+/* $Id: zebramap.c,v 1.40 2005-03-11 17:56:36 adam Exp $
Copyright (C) 1995-2005
Index Data ApS
return zms->temp_map_ptr;
}
+/*
+ * Map the text at *from for searching in register reg_id.
+ * The query map (chr_map_q_input) is tried first; a non-empty result
+ * is returned with *q_map_match set to 1. Otherwise *q_map_match is 0
+ * and the result of chr_map_input is returned. When neither yields a
+ * map (or no charmap exists for reg_id), the byte at *from is stored
+ * in the temporary map string, *from is advanced by one and the
+ * temporary map pointer is returned.
+ */
+const char **zebra_maps_search(ZebraMaps zms, unsigned reg_id,
+                               const char **from, int len, int *q_map_match)
+{
+    chrmaptab tab;
+
+    *q_map_match = 0;
+    tab = zebra_charmap_get (zms, reg_id);
+    if (tab)
+    {
+        const char **hit = chr_map_q_input(tab, from, len, 0);
+
+        if (hit && hit[0])
+        {
+            /* query map matched */
+            *q_map_match = 1;
+            return hit;
+        }
+        hit = chr_map_input(tab, from, len, 0);
+        if (hit)
+            return hit;
+    }
+
+    /* fallback: pass a single byte through the temporary map */
+    zms->temp_map_str[0] = **from;
+    (*from)++;
+    return zms->temp_map_ptr;
+}
+
+
const char *zebra_maps_output(ZebraMaps zms, unsigned reg_id,
const char **from)
{