/*
- * Copyright (c) 1995-1997, Index Data.
+ * Copyright (c) 1995-1999, Index Data.
*
* All rights reserved.
*
* OF THIS SOFTWARE.
*
* $Log: charmap.h,v $
- * Revision 1.4 1997-10-27 14:33:04 adam
+ * Revision 1.5 1999-09-07 07:19:21 adam
+ * Work on character mapping. Implemented replace rules.
+ *
+ * Revision 1.4 1997/10/27 14:33:04 adam
* Moved towards generic character mapping depending on "structure"
* field in abstract syntax file. Fixed a few memory leaks. Fixed
* bug with negative integers when doing searches with relational
#ifndef CHARMAP_H
#define CHARMAP_H
+#include <yconfig.h>
+
#ifdef __cplusplus
extern "C" {
#endif
-extern const char *CHR_UNKNOWN;
-extern const char *CHR_SPACE;
-extern const char *CHR_BASE;
+YAZ_EXPORT extern const char *CHR_UNKNOWN;
+YAZ_EXPORT extern const char *CHR_SPACE;
+YAZ_EXPORT extern const char *CHR_BASE;
struct chr_t_entry;
typedef struct chr_t_entry chr_t_entry;
typedef struct chrmaptab_info *chrmaptab;
-chrmaptab chrmaptab_create(const char *tabpath, const char *name,
- int map_only);
-void chrmaptab_destroy (chrmaptab tab);
+YAZ_EXPORT chrmaptab chrmaptab_create(const char *tabpath, const char *name,
+ int map_only);
+YAZ_EXPORT void chrmaptab_destroy (chrmaptab tab);
-const char **chr_map_input(chrmaptab t, const char **from, int len);
+YAZ_EXPORT const char **chr_map_input(chrmaptab t, const char **from, int len);
+YAZ_EXPORT const char **chr_map_input_x(chrmaptab t,
+ const char **from, int *len);
+YAZ_EXPORT const char **chr_map_input_q(chrmaptab maptab,
+ const char **from, int len,
+ const char **qmap);
+
+YAZ_EXPORT const char *chr_map_output(chrmaptab t, const char **from, int len);
-const char *chr_map_output(chrmaptab t, const char **from, int len);
+YAZ_EXPORT unsigned char zebra_prim(char **s);
#ifdef __cplusplus
}
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: recctrl.h,v $
- * Revision 1.30 1999-05-21 12:00:17 adam
+ * Revision 1.31 1999-09-07 07:19:21 adam
+ * Work on character mapping. Implemented replace rules.
+ *
+ * Revision 1.30 1999/05/21 12:00:17 adam
* Better diagnostics for extraction process.
*
* Revision 1.29 1999/05/20 12:57:18 adam
int length;
int *seqnos;
ZebraMaps zebra_maps;
+ struct recExtractCtrl *extractCtrl;
} RecWord;
/* Extract record control */
off_t offset; /* start offset */
char *subType;
void (*init)(struct recExtractCtrl *p, RecWord *w);
- void (*addWord)(RecWord *p);
+ void *clientData;
+ void (*tokenAdd)(RecWord *w);
ZebraMaps zebra_maps;
int flagShowRecords;
int seqno[256];
- void (*addSchema)(struct recExtractCtrl *p, Odr_oid *oid);
+ void (*schemaAdd)(struct recExtractCtrl *p, Odr_oid *oid);
data1_handle dh;
};
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: zebramap.h,v $
- * Revision 1.8 1999-02-12 13:29:21 adam
+ * Revision 1.9 1999-09-07 07:19:21 adam
+ * Work on character mapping. Implemented replace rules.
+ *
+ * Revision 1.8 1999/02/12 13:29:21 adam
* Implemented position-flag for registers.
*
* Revision 1.7 1999/02/02 14:50:46 adam
int zebra_maps_is_complete (ZebraMaps zms, unsigned reg_id);
int zebra_maps_is_sort (ZebraMaps zms, unsigned reg_id);
int zebra_maps_is_positioned (ZebraMaps zms, unsigned reg_id);
+
+WRBUF zebra_replace(ZebraMaps zms, unsigned reg_id, const char *ex_list,
+ const char *input_str, int input_len);
+
#ifdef __cplusplus
}
#endif
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: extract.c,v $
- * Revision 1.97 1999-07-06 12:28:04 adam
+ * Revision 1.98 1999-09-07 07:19:21 adam
+ * Work on character mapping. Implemented replace rules.
+ *
+ * Revision 1.97 1999/07/06 12:28:04 adam
* Updated record index structure. Format includes version ID. Compression
* algorithm ID is stored for each record block.
*
w->attrSet = VAL_BIB1;
w->attrUse = 1016;
w->reg_type = 'w';
+ w->extractCtrl = p;
}
static struct sortKey {
static void addRecordKey (RecWord *p)
{
+ WRBUF wrbuf;
+ if ((wrbuf = zebra_replace(p->zebra_maps, p->reg_type, 0,
+ p->string, p->length)))
+ {
+ p->string = wrbuf_buf(wrbuf);
+ p->length = wrbuf_len(wrbuf);
+ }
if (zebra_maps_is_complete (p->zebra_maps, p->reg_type))
addCompleteField (p);
else
extractCtrl.fh = fi;
extractCtrl.subType = subType;
extractCtrl.init = wordInit;
- extractCtrl.addWord = addRecordKey;
- extractCtrl.addSchema = addSchema;
+ extractCtrl.tokenAdd = addRecordKey;
+ extractCtrl.schemaAdd = addSchema;
extractCtrl.dh = rGroup->dh;
for (i = 0; i<256; i++)
{
{
if (zebraExplain_newDatabase (zti, rGroup->databaseName,
rGroup->explainDatabase))
- abort ();
+ return 0;
}
if (rGroup->flagStoreData == -1)
reckeys.prevSeqNo = 0;
extractCtrl.init = wordInit;
- extractCtrl.addWord = addRecordKey;
- extractCtrl.addSchema = addSchema;
+ extractCtrl.tokenAdd = addRecordKey;
+ extractCtrl.schemaAdd = addSchema;
extractCtrl.dh = rGroup->dh;
for (i = 0; i<256; i++)
extractCtrl.seqno[i] = 0;
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: kdump.c,v $
- * Revision 1.17 1999-02-02 14:50:55 adam
+ * Revision 1.18 1999-09-07 07:19:21 adam
+ * Work on character mapping. Implemented replace rules.
+ *
+ * Revision 1.17 1999/02/02 14:50:55 adam
* Updated WIN32 code specific sections. Changed header.
*
* Revision 1.16 1998/05/20 10:12:17 adam
while (*from)
{
const char *res = zebra_maps_output (zm, reg_type, &from);
- while (*res)
- *(to++) = *(res++);
+ if (!res)
+ *to++ = *from++;
+ else
+ while (*res)
+ *to++ = *res++;
}
*to = '\0';
printf ("%c %3d %c %7d %5d %s\n", reg_type, usedb_type, op ? 'i':'d',
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: main.c,v $
- * Revision 1.69 1999-07-21 08:31:33 adam
+ * Revision 1.70 1999-09-07 07:19:21 adam
+ * Work on character mapping. Implemented replace rules.
+ *
+ * Revision 1.69 1999/07/21 08:31:33 adam
* More version info on WIN32.
*
* Revision 1.68 1999/07/14 10:59:26 adam
#endif
#if HAVE_BZLIB_H
fprintf (stderr, "libbzip2\n"
- " (C) 1996-1998 Julian R Seward. All rights reserved.\n");
+ " (C) 1996-1999 Julian R Seward. All rights reserved.\n");
#endif
}
else if (ret == 'v')
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: zrpn.c,v $
- * Revision 1.94 1999-07-20 13:59:18 adam
+ * Revision 1.95 1999-09-07 07:19:21 adam
+ * Work on character mapping. Implemented replace rules.
+ *
+ * Revision 1.94 1999/07/20 13:59:18 adam
* Fixed bug that occurred when phrases had 0 hits.
*
* Revision 1.93 1999/06/17 14:38:40 adam
while (*src)
{
const char *cp = zebra_maps_output (zh->zebra_maps, reg_type, &src);
- while (*cp)
- *dst++ = *cp++;
+ if (!cp)
+ *dst++ = *src++;
+ else
+ while (*cp)
+ *dst++ = *cp++;
}
*dst = '\0';
}
dst_term);
}
+
+/*
+ * term_104: handle term, where trunc=Process # and !
+ *
+ * Translates the term at *src into a regular expression in dst:
+ *   '#'  ->  ".*"
+ *   '!'  ->  "."
+ *   any other input character is copied, preceded by a backslash unless
+ *   it is alphanumeric.
+ * The term text itself (wildcards included) is copied to dst_term.  When
+ * space_split is non-zero, translation stops at the first input that maps
+ * to CHR_SPACE.
+ *
+ * Both dst and dst_term are NUL-terminated.  No bounds checking is done
+ * here; the caller supplies buffers that are large enough.
+ *
+ * On return *src points past the consumed input.  Returns the length of
+ * the expression written to dst; 0 when term_pre() returns 0.
+ */
+static int term_104 (ZebraMaps zebra_maps, int reg_type,
+                     const char **src, char *dst, int space_split,
+                     char *dst_term)
+{
+    const char *s0, *s1;
+    const char **map;
+    int i = 0;
+    int j = 0;
+
+    if (!term_pre (zebra_maps, reg_type, src, "#!", "#!"))
+        return 0;
+    s0 = *src;
+    while (*s0)
+    {
+        if (*s0 == '#')
+        {
+            dst[i++] = '.';
+            dst[i++] = '*';
+            dst_term[j++] = *s0++;
+        }
+        else if (*s0 == '!')
+        {
+            dst[i++] = '.';
+            dst_term[j++] = *s0++;
+        }
+        else
+        {
+            /* Ordinary input.  This must be an 'else' branch: otherwise the
+             * character following a wildcard is copied literally in the same
+             * iteration, so "##" would turn into ".*\#". */
+            s1 = s0;
+            map = zebra_maps_input (zebra_maps, reg_type, &s0, strlen(s0));
+            if (space_split && **map == *CHR_SPACE)
+                break;
+            while (s1 < s0)
+            {
+                /* <ctype.h> functions require an unsigned char value */
+                if (!isalnum ((unsigned char) *s1))
+                    dst[i++] = '\\';
+                dst_term[j++] = *s1;
+                dst[i++] = *s1++;
+            }
+        }
+    }
+    dst[i] = '\0';
+    dst_term[j] = '\0';
+    *src = s0;
+    return i;
+}
+
+
/* gen_regular_rel - generate regular expression from relation
* val: border value (inclusive)
* islt: 1 if <=; 0 if >=.
dst[dst_p] = '\0';
if (islt)
{
- for (i=1; i<pos; i++)
- strcat (dst, "[0-9]?");
+ /* match everything less than 10^(pos-1) */
+ strcat (dst, "0*");
+ for (i=1; i<pos; i++)
+ strcat (dst, "[0-9]?");
}
else
{
+ /* match everything greater than 10^pos */
for (i = 0; i <= pos; i++)
strcat (dst, "[0-9]");
strcat (dst, "[0-9]*");
logf (LOG_WARN, "dict_lookup_grep err, trunc=eregular: %d",
r);
break;
+ case 104: /* process # and ! in term */
+ term_dict[j++] = '(';
+ if (!term_104 (zh->zebra_maps, reg_type,
+ &termp, term_dict + j, space_split, term_dst))
+ return 0;
+ strcat (term_dict, ")");
+ r = dict_lookup_grep (zh->dict, term_dict, 0, grep_info,
+ &max_pos, 0, grep_handle);
+ if (r)
+ logf (LOG_WARN, "dict_lookup_grep err, trunc=#/!: %d", r);
+ break;
}
}
*term_sub = termp;
return result;
}
+
+/*
+ * normalize_term: apply the replace rules of register reg_id to a query
+ * term before it is searched.
+ *
+ * The truncation attribute (type 5) of zapt selects which characters are
+ * handed to zebra_replace() as its exclusion list:
+ *   101       "#"
+ *   104       "!#"
+ *   102, 103  no replacement at all (the term is used as is)
+ *   other     ""  (nothing excluded)
+ *
+ * Returns a NUL-terminated copy of the resulting term, allocated from
+ * stream.  The buffer returned by zebra_replace() is not handed out; its
+ * contents are copied.
+ */
+char *normalize_term(ZebraHandle zh, Z_AttributesPlusTerm *zapt,
+                     const char *termz, NMEM stream, unsigned reg_id)
+{
+    WRBUF wrbuf = 0;
+    AttrType truncation;
+    int truncation_value;
+    const char *ex_list;    /* only ever points at string literals */
+
+    attr_init (&truncation, zapt, 5);
+    truncation_value = attr_find (&truncation, NULL);
+
+    switch (truncation_value)
+    {
+    case 101:
+        ex_list = "#";
+        break;
+    case 102:
+    case 103:
+        ex_list = 0;
+        break;
+    case 104:
+        ex_list = "!#";
+        break;
+    default:
+        ex_list = "";
+        break;
+    }
+    if (ex_list)
+        wrbuf = zebra_replace(zh->zebra_maps, reg_id, ex_list,
+                              termz, strlen(termz));
+    if (!wrbuf)
+        return nmem_strdup(stream, termz);
+    else
+    {
+        char *buf = (char*) nmem_malloc (stream, wrbuf_len(wrbuf)+1);
+        memcpy (buf, wrbuf_buf(wrbuf), wrbuf_len(wrbuf));
+        buf[wrbuf_len(wrbuf)] = '\0';
+        return buf;
+    }
+}
+
static RSET rpn_search_APT_phrase (ZebraHandle zh,
Z_AttributesPlusTerm *zapt,
- const char *termz,
+ const char *termz_org,
oid_value attributeSet,
NMEM stream,
int reg_type, int complete_flag,
int num_bases, char **basenames)
{
char term_dst[IT_MAX_WORD+1];
- const char *termp = termz;
RSET rset[60], result;
int i, r, rset_no = 0;
struct grep_info grep_info;
+ char *termz = normalize_term(zh, zapt, termz_org, stream, reg_type);
+ const char *termp = termz;
#ifdef TERM_COUNT
grep_info.term_no = 0;
static RSET rpn_search_APT_or_list (ZebraHandle zh,
Z_AttributesPlusTerm *zapt,
- const char *termz,
+ const char *termz_org,
oid_value attributeSet,
NMEM stream,
int reg_type, int complete_flag,
int num_bases, char **basenames)
{
char term_dst[IT_MAX_WORD+1];
- const char *termp = termz;
RSET rset[60], result;
int i, r, rset_no = 0;
struct grep_info grep_info;
-
+ char *termz = normalize_term(zh, zapt, termz_org, stream, reg_type);
+ const char *termp = termz;
#ifdef TERM_COUNT
grep_info.term_no = 0;
#endif
static RSET rpn_search_APT_and_list (ZebraHandle zh,
Z_AttributesPlusTerm *zapt,
- const char *termz,
+ const char *termz_org,
oid_value attributeSet,
NMEM stream,
int reg_type, int complete_flag,
int num_bases, char **basenames)
{
char term_dst[IT_MAX_WORD+1];
- const char *termp = termz;
RSET rset[60], result;
int i, r, rset_no = 0;
struct grep_info grep_info;
+ char *termz = normalize_term(zh, zapt, termz_org, stream, reg_type);
+ const char *termp = termz;
#ifdef TERM_COUNT
grep_info.term_no = 0;
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: recgrs.c,v $
- * Revision 1.31 1999-07-14 10:56:43 adam
+ * Revision 1.32 1999-09-07 07:19:21 adam
+ * Work on character mapping. Implemented replace rules.
+ *
+ * Revision 1.31 1999/07/14 10:56:43 adam
* Fixed potential memory leak.
*
* Revision 1.30 1999/07/06 12:26:41 adam
wrd.length = n->u.data.len;
wrd.attrSet = (int) (tlist->att->parent->reference);
wrd.attrUse = tlist->att->locals->local;
- (*p->addWord)(&wrd);
+ (*p->tokenAdd)(&wrd);
}
}
}
oe.value = n->u.root.absyn->reference;
if ((oid_ent_to_oid (&oe, oidtmp)))
- (*p->addSchema)(p, oidtmp);
+ (*p->schemaAdd)(p, oidtmp);
return dumpkeys(n, p, 0);
}
oe.oclass = CLASS_SCHEMA;
oe.value = n->u.root.absyn->reference;
if ((oid_ent_to_oid (&oe, oidtmp)))
- (*p->addSchema)(p, oidtmp);
+ (*p->schemaAdd)(p, oidtmp);
if (dumpkeys(n, p, 0) < 0)
{
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: rectext.c,v $
- * Revision 1.12 1999-05-26 07:49:14 adam
+ * Revision 1.13 1999-09-07 07:19:21 adam
+ * Work on character mapping. Implemented replace rules.
+ *
+ * Revision 1.12 1999/05/26 07:49:14 adam
* C++ compilation.
*
* Revision 1.11 1999/05/21 12:00:17 adam
{
recWord.string = w;
recWord.length = i;
- (*p->addWord)(&recWord);
+ (*p->tokenAdd)(&recWord);
}
} while (r > 0);
buf_close (fi);
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: regxread.c,v $
- * Revision 1.31 1999-07-14 13:05:29 adam
+ * Revision 1.32 1999-09-07 07:19:21 adam
+ * Work on character mapping. Implemented replace rules.
+ *
+ * Revision 1.31 1999/07/14 13:05:29 adam
* Tcl filter works with objects when TCL is version 8 or later; filter
* works with strings otherwise (slow).
*
static void tagBegin (struct lexSpec *spec,
const char *tag, int len)
{
- struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
+ struct data1_node *parent;
data1_element *elem = NULL;
- data1_node *partag = get_parent_tag(spec->dh, parent);
+ data1_node *partag;
data1_node *res;
data1_element *e = NULL;
int localtag = 0;
return ;
}
tagStrip (&tag, &len);
+
+ parent = spec->d1_stack[spec->d1_level -1];
+ partag = get_parent_tag(spec->dh, parent);
res = data1_mk_node (spec->dh, spec->m);
res->parent = parent;
--- /dev/null
+# Danish/Swedish character map.
+#
+# $Id: scan.chr,v 1.1 1999-09-07 07:19:21 adam Exp $
+
+# Define the basic value-set. *Beware* of changing this without re-indexing
+# your databases.
+
+lowercase {0-9}{a-y}üzæäøöå
+uppercase {0-9}{A-Y}ÜZÆÄØÖÅ
+
+# Breaking characters
+
+space {\001-\040}!"#$%&'\()*+,-./:;<=>?@\[\\]^_`\{|}~
+
+# Characters to be considered equivalent for searching purposes.
+
+# equivalent æä(ae)
+# equivalent øö(oe)
+# equivalent å(aa)
+# equivalent uü
+
+# Supplemental mappings
+
+map (&auml;) ä
+map (&aelig;) æ
+map (&oslash;) ø
+map (&aring;) å
+map (&ouml;) ö
+map (&Auml;) Ä
+map (&Aelig;) Æ
+map (&Oslash;) Ø
+map (&Aring;) Å
+map (&Ouml;) Ö
+
+map éÉ e
+map á a
+map ó o
+map à i
+
+map (Aa) (AA)
+
+map (aa) a
+
+#qmap (ies) (ie)
-# Danish/Swedish character map.
+# Generic character map.
#
-# $Id: string.chr,v 1.3 1998-11-29 22:45:55 quinn Exp $
+# $Id: string.chr,v 1.4 1999-09-07 07:19:21 adam Exp $
# Define the basic value-set. *Beware* of changing this without re-indexing
# your databases.
# Supplemental mappings
-map (&auml;) ä
-map (&aelig;) æ
-map (&oslash;) ø
-map (&aring;) å
-map (&ouml;) ö
-map (&Auml;) Ä
-map (&Aelig;) Æ
-map (&Oslash;) Ø
-map (&Aring;) Å
-map (&Ouml;) Ö
-
-map éÉ e
-map á a
-map ó o
-map à i
-
-map (Aa) (AA)
+#map (&auml;) ä
+#map (&aelig;) æ
+#map (&oslash;) ø
+#map (&aring;) å
+#map (&ouml;) ö
+#map (&Auml;) Ä
+#map (&Aelig;) Æ
+#map (&Oslash;) Ø
+#map (&Aring;) Å
+#map (&Ouml;) Ö
+
+#map éÉ e
+#map á a
+#map ó o
+#map à i
+
+#map (Aa) (AA)
+
+#map (aa) a
/*
- * Copyright (C) 1996-1998, Index Data
+ * Copyright (C) 1996-1999, Index Data
* All rights reserved.
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: charmap.c,v $
- * Revision 1.15 1999-05-26 07:49:14 adam
+ * Revision 1.16 1999-09-07 07:19:21 adam
+ * Work on character mapping. Implemented replace rules.
+ *
+ * Revision 1.15 1999/05/26 07:49:14 adam
* C++ compilation.
*
* Revision 1.14 1998/10/13 20:09:18 adam
#define CHR_MAXSTR 1024
#define CHR_MAXEQUIV 32
-int chr_map_chrs(chr_t_entry *t, char **from, int len,
- int *read, char **to, int max);
-
const char *CHR_UNKNOWN = "\001";
const char *CHR_SPACE = "\002";
const char *CHR_BASE = "\003";
struct chrmaptab_info
{
chr_t_entry *input; /* mapping table for input data */
- chr_t_entry *query_equiv; /* mapping table for queries */
+ chr_t_entry *q_input; /* mapping table for queries */
unsigned char *output[256]; /* return mapping - for display of registers */
int base_uppercase; /* Start of upper-case ordinals */
- char **tmp_buf;
NMEM nmem;
};
*/
struct chr_t_entry
{
- chr_t_entry **children; /* array of children */
- unsigned char *target; /* target for this node, if any */
- unsigned char *equiv; /* equivalent to, or sumthin */
+ chr_t_entry **children; /* array of children */
+ unsigned char **target; /* target for this node, if any */
};
/*
}
if (!len)
{
- if (!root->target || (char*) root->target == CHR_SPACE ||
- (char*) root->target == CHR_UNKNOWN)
- root->target = (unsigned char *) nmem_strdup(nmem, to);
- else if ((char*) to != CHR_SPACE)
- logf(LOG_DEBUG, "Character map overlap");
+ if (!root->target || !root->target[0] || strcmp(root->target[0], to))
+ {
+ root->target = (unsigned char **)
+ nmem_malloc(nmem, sizeof(*root->target)*2);
+ root->target[0] = (unsigned char *) nmem_strdup(nmem, to);
+ root->target[1] = 0;
+ }
}
else
{
return root;
}
-int chr_map_chrs(chr_t_entry *t, char **from, int len, int *read, char **to,
- int max)
-{
- int i = 0;
- unsigned char *s;
-
- while (len && t->children && t->children[(unsigned char) **from])
- {
- t = t->children[(unsigned char) **from];
- (*from)++;
- len--;
- }
- /* if there were no matches, we are still at the root node,
- which always has a null mapping */
- for (s = t->target; *s && max; s++)
- {
- **to = *s;
- s++;
- (*to)++;
- max--;
- i++;
- }
- return i;
-}
-
-
static chr_t_entry *find_entry(chr_t_entry *t, const char **from, int len)
{
chr_t_entry *res;
*from = pos;
}
/* no children match. use ourselves, if we have a target */
- return t->target ? t : 0;
+ return t->target ? t : 0;
+}
+
+/*
+ * find_entry_x: deepest-match lookup in a mapping trie, reading from a
+ * list of input buffers.
+ *
+ * from[k] and len[k] describe consecutive buffers; a negative length marks
+ * the end of the list.  Consumed characters are accounted for in place:
+ * the buffer pointer is advanced and its length decremented.  When a
+ * deeper path yields no entry with a target, the cursor of the current
+ * buffer is restored.
+ *
+ * Returns the deepest node along the input that has a target, or 0 if
+ * neither a descendant nor t itself has one.
+ */
+static chr_t_entry *find_entry_x(chr_t_entry *t, const char **from, int *len)
+{
+    /* step over exhausted buffers; stops on data or on the end marker */
+    while (*len == 0)
+    {
+        from++;
+        len++;
+    }
+    if (*len > 0 && t->children)
+    {
+        chr_t_entry *child = t->children[(unsigned char) **from];
+
+        if (child)
+        {
+            const char *saved_from = *from;
+            int saved_len = *len;
+            chr_t_entry *hit;
+
+            ++*from;
+            --*len;
+            hit = find_entry_x(child, from, len);
+            if (hit)
+                return hit;
+            /* nothing usable further down: un-consume the character */
+            *from = saved_from;
+            *len = saved_len;
+        }
+    }
+    /* no child matched; this node is the answer only if it has a target */
+    if (!t->target)
+        return 0;
+    return t;
+}
+
+/*
+ * chr_map_input_x: map the next input sequence through the input table of
+ * maptab.  from/len are passed on to find_entry_x() unchanged.
+ *
+ * Returns the target array of the matching entry.  Calls abort() when no
+ * entry matches.
+ */
+const char **chr_map_input_x(chrmaptab maptab, const char **from, int *len)
+{
+    chr_t_entry *hit = find_entry_x(maptab->input, from, len);
+
+    if (!hit)
+        abort();
+    return (const char **) hit->target;
+}
const char **chr_map_input(chrmaptab maptab, const char **from, int len)
{
chr_t_entry *t = maptab->input;
chr_t_entry *res;
+ int len_tmp[2];
- if (!(res = find_entry(t, from, len)))
+ len_tmp[0] = len;
+ len_tmp[1] = -1;
+ if (!(res = find_entry_x(t, from, len_tmp)))
abort();
- maptab->tmp_buf[0] = (char*) res->target;
- maptab->tmp_buf[1] = NULL;
- return (const char **) maptab->tmp_buf;
+ return (const char **) (res->target);
}
const char *chr_map_output(chrmaptab maptab, const char **from, int len)
return (const char*) maptab->output[c];
}
-static unsigned char prim(char **s)
+unsigned char zebra_prim(char **s)
{
unsigned char c;
unsigned int i;
-
+
if (**s == '\\')
{
(*s)++;
c = **s;
switch (c)
{
- case '\\': c = '\\'; (*s)++; break;
- case 'r': c = '\r'; (*s)++; break;
- case 'n': c = '\n'; (*s)++; break;
- case 't': c = '\t'; (*s)++; break;
- case 's': c = ' '; (*s)++; break;
- case 'x': sscanf(*s, "x%2x", &i); c = i; *s += 3; break;
- case '{': case '[': case '(': case '}': case ']': case ')':
- (*s)++;
- break;
- default: sscanf(*s, "%3o", &i); c = i; *s += 3; break;
+ case '\\': c = '\\'; (*s)++; break;
+ case 'r': c = '\r'; (*s)++; break;
+ case 'n': c = '\n'; (*s)++; break;
+ case 't': c = '\t'; (*s)++; break;
+ case 's': c = ' '; (*s)++; break;
+ case 'x': sscanf(*s, "x%2x", &i); c = i; *s += 3; break;
+ case '{': case '[': case '(': case '}': case ']': case ')': case '$':
+ (*s)++;
+ break;
+ default:
+ sscanf(*s, "%3o", &i); c = i; *s += 3; break;
}
return c;
}
{
chrmaptab tab = (chrmaptab) data;
char tmp[2];
-
+
tmp[0] = num; tmp[1] = '\0';
tab->input = set_map_string(tab->input, tab->nmem, s, strlen(s), tmp);
tab->output[num + tab->base_uppercase] =
/*
* Add a map to the string contained in the argument.
*/
-static void fun_addmap(const char *s, void *data, int num)
+static void fun_add_map(const char *s, void *data, int num)
{
chrwork *arg = (chrwork *) data;
assert(arg->map->input);
+ logf (LOG_LOG, "set map %.*s", (int) strlen(s), s);
set_map_string(arg->map->input, arg->map->nmem, s, strlen(s), arg->string);
+ for (s = arg->string; *s; s++)
+ logf (LOG_LOG, " %3d", (unsigned char) *s);
+}
+
+/*
+ * scan_string() callback for the "qmap" directive: registers s as a source
+ * sequence in the query mapping table, mapping to the target string held
+ * in the chrwork argument.  The source and every byte of the target are
+ * written to the log.  num is not used.
+ */
+static void fun_add_qmap(const char *s, void *data, int num)
+{
+    chrwork *work = (chrwork *) data;
+    const char *cp;
+
+    assert(work->map->q_input);
+    logf (LOG_LOG, "set qmap %.*s", (int) strlen(s), s);
+    set_map_string(work->map->q_input, work->map->nmem, s,
+                   strlen(s), work->string);
+    cp = work->string;
+    while (*cp)
+    {
+        logf (LOG_LOG, " %3d", (unsigned char) *cp);
+        cp++;
+    }
+}
+
static int scan_string(char *s,
void (*fun)(const char *c, void *data, int num),
void *data, int *num)
{
unsigned char c, str[1024], begin, end, *p;
-
+
while (*s)
{
switch (*s)
{
- case '{':
- s++;
- begin = prim(&s);
- if (*s != '-')
- {
- logf(LOG_FATAL, "Bad range in char-map");
- return -1;
- }
- s++;
- end = prim(&s);
- if (end <= begin)
- {
- logf(LOG_FATAL, "Bad range in char-map");
- return -1;
- }
- s++;
- for (c = begin; c <= end; c++)
- {
- str[0] = c; str[1] = '\0';
- (*fun)((char *) str, data, num ? (*num)++ : 0);
- }
- break;
- case '[': s++; abort(); break;
- case '(':
- p = (unsigned char*) ++s;
+ case '{':
+ s++;
+ begin = zebra_prim(&s);
+ if (*s != '-')
+ {
+ logf(LOG_FATAL, "Bad range in char-map");
+ return -1;
+ }
+ s++;
+ end = zebra_prim(&s);
+ if (end <= begin)
+ {
+ logf(LOG_FATAL, "Bad range in char-map");
+ return -1;
+ }
+ s++;
+ for (c = begin; c <= end; c++)
+ {
+ str[0] = c; str[1] = '\0';
+ (*fun)((char *) str, data, num ? (*num)++ : 0);
+ }
+ break;
+ case '[': s++; abort(); break;
+ case '(':
+ p = (unsigned char*) ++s;
/* Find the end-marker, ignoring escapes */
- do
+ do
+ {
+ if (!(p = (unsigned char*) strchr((char*) p, ')')))
{
- if (!(p = (unsigned char*) strchr((char*) p, ')')))
- {
- logf(LOG_FATAL, "Missing ')' in string");
- return -1;
- }
+ logf(LOG_FATAL, "Missing ')' in string");
+ return -1;
}
- while (*(p - 1) == '\\');
- *p = 0;
- (*fun)(s, data, num ? (*num)++ : 0);
- s = (char*) p + 1;
- break;
- default:
- c = prim(&s);
- str[0] = c; str[1] = '\0';
- (*fun)((char *) str, data, num ? (*num)++ : 0);
+ }
+ while (*(p - 1) == '\\');
+ *p = 0;
+ (*fun)(s, data, num ? (*num)++ : 0);
+ s = (char*) p + 1;
+ break;
+ default:
+ c = zebra_prim(&s);
+ str[0] = c; str[1] = '\0';
+ (*fun)((char *) str, data, num ? (*num)++ : 0);
}
}
return 0;
char line[512], *argv[50];
chrmaptab res;
int lineno = 0;
+ int errors = 0;
int argc, num = (int) *CHR_BASE, i;
+ NMEM nmem;
+ logf (LOG_LOG, "maptab %s open", name);
if (!(f = yaz_path_fopen(tabpath, name, "r")))
{
logf(LOG_WARN|LOG_ERRNO, "%s", name);
return 0;
}
- res = (chrmaptab) xmalloc(sizeof(*res));
- res->nmem = nmem_create ();
- res->tmp_buf = (char **)
- nmem_malloc (res->nmem, sizeof(*res->tmp_buf) * 100);
+ nmem = nmem_create ();
+ res = (chrmaptab) nmem_malloc(nmem, sizeof(*res));
+ res->nmem = nmem;
res->input = (chr_t_entry *) nmem_malloc(res->nmem, sizeof(*res->input));
- res->input->target = (unsigned char*) CHR_UNKNOWN;
- res->input->equiv = 0;
+ res->input->target = (unsigned char **)
+ nmem_malloc(res->nmem, sizeof(*res->input->target) * 2);
+ res->input->target[0] = (unsigned char*) CHR_UNKNOWN;
+ res->input->target[1] = 0;
res->input->children = (chr_t_entry **)
nmem_malloc(res->nmem, sizeof(res->input) * 256);
for (i = 0; i < 256; i++)
res->input->children[i] = (chr_t_entry *)
nmem_malloc(res->nmem, sizeof(*res->input));
res->input->children[i]->children = 0;
+ res->input->children[i]->target = (unsigned char **)
+ nmem_malloc (res->nmem, 2 * sizeof(unsigned char *));
+ res->input->children[i]->target[1] = 0;
if (map_only)
{
- res->input->children[i]->target = (unsigned char *)
- nmem_malloc (res->nmem, 2 * sizeof(char));
- res->input->children[i]->target[0] = i;
- res->input->children[i]->target[1] = 0;
+ res->input->children[i]->target[0] = (unsigned char *)
+ nmem_malloc (res->nmem, 2 * sizeof(unsigned char));
+ res->input->children[i]->target[0][0] = i;
+ res->input->children[i]->target[0][1] = 0;
}
else
- res->input->children[i]->target = (unsigned char*) CHR_UNKNOWN;
- res->input->children[i]->equiv = 0;
+ res->input->children[i]->target[0] = (unsigned char*) CHR_UNKNOWN;
}
- res->query_equiv = 0;
+ res->q_input = (chr_t_entry *)
+ nmem_malloc(res->nmem, sizeof(*res->q_input));
+ res->q_input->target = 0;
+ res->q_input->children = 0;
+
for (i = *CHR_BASE; i < 256; i++)
res->output[i] = 0;
res->output[(int) *CHR_SPACE] = (unsigned char *) " ";
res->output[(int) *CHR_UNKNOWN] = (unsigned char*) "@";
res->base_uppercase = 0;
- while ((argc = readconf_line(f, &lineno, line, 512, argv, 50)))
+ while (!errors && (argc = readconf_line(f, &lineno, line, 512, argv, 50)))
if (!map_only && !yaz_matchstr(argv[0], "lowercase"))
{
if (argc != 2)
{
logf(LOG_FATAL, "Syntax error in charmap");
- fclose(f);
- return 0;
+ ++errors;
}
if (scan_string(argv[1], fun_addentry, res, &num) < 0)
{
logf(LOG_FATAL, "Bad value-set specification");
- fclose(f);
- return 0;
+ ++errors;
}
res->base_uppercase = num;
res->output[(int) *CHR_SPACE + num] = (unsigned char *) " ";
if (!res->base_uppercase)
{
logf(LOG_FATAL, "Uppercase directive with no lowercase set");
- fclose(f);
- return 0;
+ ++errors;
}
if (argc != 2)
{
- logf(LOG_FATAL, "Syntax error in charmap");
- fclose(f);
- return 0;
+ logf(LOG_FATAL, "Missing arg for uppercase directive");
+ ++errors;
}
if (scan_string(argv[1], fun_addentry, res, &num) < 0)
{
logf(LOG_FATAL, "Bad value-set specification");
- fclose(f);
- return 0;
+ ++errors;
}
}
else if (!map_only && !yaz_matchstr(argv[0], "space"))
if (argc != 2)
{
logf(LOG_FATAL, "Syntax error in charmap");
- fclose(f);
- return 0;
+ ++errors;
}
if (scan_string(argv[1], fun_addspace, res, 0) < 0)
{
logf(LOG_FATAL, "Bad space specification");
- fclose(f);
- return 0;
+ ++errors;
}
}
else if (!yaz_matchstr(argv[0], "map"))
if (argc != 3)
{
- logf(LOG_FATAL, "charmap MAP directive requires 2 args");
- fclose(f);
- return 0;
+ logf(LOG_FATAL, "charmap directive map requires 2 args");
+ ++errors;
}
buf.map = res;
buf.string[0] = '\0';
if (scan_string(argv[2], fun_mkstring, &buf, 0) < 0)
{
logf(LOG_FATAL, "Bad map target");
- fclose(f);
- return 0;
+ ++errors;
}
- if (scan_string(argv[1], fun_addmap, &buf, 0) < 0)
+ if (scan_string(argv[1], fun_add_map, &buf, 0) < 0)
{
logf(LOG_FATAL, "Bad map source");
- fclose(f);
- return 0;
+ ++errors;
+ }
+ }
+ else if (!yaz_matchstr(argv[0], "qmap"))
+ {
+ chrwork buf;
+
+ if (argc != 3)
+ {
+ logf(LOG_FATAL, "charmap directive qmap requires 2 args");
+ ++errors;
+ }
+ buf.map = res;
+ buf.string[0] = '\0';
+ if (scan_string(argv[2], fun_mkstring, &buf, 0) < 0)
+ {
+ logf(LOG_FATAL, "Bad qmap target");
+ ++errors;
+ }
+ if (scan_string(argv[1], fun_add_qmap, &buf, 0) < 0)
+ {
+ logf(LOG_FATAL, "Bad qmap source");
+ ++errors;
}
}
else
{
logf(LOG_WARN, "Syntax error at '%s' in %s", line, name);
}
+
fclose(f);
+ if (errors)
+ {
+ chrmaptab_destroy(res);
+ res = 0;
+ }
+ logf (LOG_LOG, "maptab %s close %d errors", name, errors);
return res;
}
void chrmaptab_destroy(chrmaptab tab)
{
- nmem_destroy (tab->nmem);
- xfree (tab);
+ if (tab)
+ nmem_destroy (tab->nmem);
}
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: zebramap.c,v $
- * Revision 1.15 1999-05-26 07:49:14 adam
+ * Revision 1.16 1999-09-07 07:19:21 adam
+ * Work on character mapping. Implemented replace rules.
+ *
+ * Revision 1.15 1999/05/26 07:49:14 adam
* C++ compilation.
*
* Revision 1.14 1999/02/19 10:37:40 adam
#define ZEBRA_MAP_TYPE_SORT 1
#define ZEBRA_MAP_TYPE_INDEX 2
+struct zm_token {
+ char *token_from;
+ char *token_to;
+ int token_min;
+ struct zm_token *next;
+};
+
struct zebra_map {
unsigned reg_id;
int completeness;
chrmaptab maptab;
const char *maptab_name;
struct zebra_map *next;
+ struct zm_token *replace_tokens;
};
struct zebra_maps {
char temp_map_str[2];
const char *temp_map_ptr[2];
struct zebra_map **lookup_array;
+ WRBUF wrbuf_1, wrbuf_2;
};
void zebra_maps_close (ZebraMaps zms)
chrmaptab_destroy (zm->maptab);
zm = zm->next;
}
+ wrbuf_free (zms->wrbuf_1, 1);
+ wrbuf_free (zms->wrbuf_2, 1);
nmem_destroy (zms->nmem);
xfree (zms);
}
(*zm)->type = ZEBRA_MAP_TYPE_INDEX;
(*zm)->completeness = 0;
(*zm)->positioned = 1;
+ (*zm)->replace_tokens = 0;
}
else if (!yaz_matchstr (argv[0], "sort") && argc == 2)
{
if ((*zm)->type == ZEBRA_MAP_TYPE_SORT)
(*zm)->u.sort.entry_size = atoi (argv[1]);
}
+ else if (zm && !yaz_matchstr (argv[0], "replace") && argc >= 2)
+ {
+ struct zm_token *token = nmem_malloc (zms->nmem, sizeof(*token));
+ char *cp, *dp;
+ token->next = (*zm)->replace_tokens;
+ (*zm)->replace_tokens = token;
+ dp = token->token_from = nmem_strdup (zms->nmem, cp = argv[1]);
+ while (*cp)
+ {
+ if (*cp == '$')
+ {
+ *dp++ = ' ';
+ cp++;
+ }
+ else
+ *dp++ = zebra_prim(&cp);
+ }
+ *dp = '\0';
+
+ if (argc >= 3)
+ {
+ dp = token->token_to = nmem_strdup (zms->nmem, cp = argv[2]);
+ while (*cp)
+ {
+ if (*cp == '$')
+ {
+ *dp++ = ' ';
+ cp++;
+ }
+ else
+ *dp++ = zebra_prim(&cp);
+ }
+ *dp = '\0';
+ }
+ else
+ token->token_to = 0;
+ }
}
if (zm)
(*zm)->next = NULL;
int i;
zms->nmem = nmem_create ();
- zms->tabpath = nmem_strdup (zms->nmem, res_get_def (res, "profilePath", "."));
+ zms->tabpath = nmem_strdup (zms->nmem,
+ res_get_def (res, "profilePath", "."));
zms->map_list = NULL;
zms->temp_map_str[0] = '\0';
zms->lookup_array[i] = 0;
if (!res || !res_trav (res, "index", zms, zms_map_handle))
zebra_map_read (zms, "default.idx");
+
+ zms->wrbuf_1 = wrbuf_alloc();
+ zms->wrbuf_2 = wrbuf_alloc();
return zms;
}
return zms->temp_map_ptr;
}
+#if 0
+int zebra_maps_input_tokens (ZebraMaps zms, unsigned reg_id,
+ const char *input_str, int input_len,
+ WRBUF wrbuf)
+{
+ chrmaptab maptab = zebra_charmap_get (zms, reg_id);
+ int len[4];
+ char *str[3];
+ int input_i = 0;
+ int first = 1;
+ const char **out;
+
+ if (!maptab)
+ {
+ wrbuf_write (wrbuf, input_str, input_len);
+ return -1;
+ }
+ str[0] = " ";
+ len[0] = 1;
+ str[1] = input_str;
+ len[1] = input_len;
+ str[2] = " ";
+ len[2] = 1;
+ len[3] = -1;
+
+ out = chr_map_input (maptab, str, len);
+ while (len[1] > 0)
+ {
+ while (out && *out && **out == *CHR_SPACE)
+ out = chr_map_input (maptab, str, len);
+ }
+}
+#endif
+
const char *zebra_maps_output(ZebraMaps zms, unsigned reg_id,
const char **from)
{
- chrmaptab maptab;
- unsigned char i = (unsigned char) **from;
- static char buf[2] = {0,0};
-
- maptab = zebra_charmap_get (zms, reg_id);
- if (maptab)
- return chr_map_output (maptab, from, 1);
- (*from)++;
- buf[0] = i;
- return buf;
+ chrmaptab maptab = zebra_charmap_get (zms, reg_id);
+ if (!maptab)
+ return 0;
+ return chr_map_output (maptab, from, 1);
}
}
return 0;
}
+
+int zebra_replace_sub(ZebraMaps zms, unsigned reg_id, const char *ex_list,
+ const char *input_str, int input_len, WRBUF wrbuf);
+
+/*
+ * zebra_replace: apply the replace rules of register reg_id to a string.
+ *
+ * The input is copied into an internal buffer and zebra_replace_sub() is
+ * run over it repeatedly, alternating between the two buffers held in zms,
+ * until a pass performs no replacement.
+ *
+ * ex_list, when not NULL, is passed on to zebra_replace_sub() as the set
+ * of characters a rule must not contain.
+ *
+ * Returns one of the two WRBUFs owned by zms; its contents are overwritten
+ * by the next call, so callers copy what they need to keep.  If the
+ * register has no map or no rules, the buffer holds the unmodified input.
+ *
+ * NOTE(review): the loop only ends when a pass makes no replacement; a
+ * rule whose output matches a rule again would never terminate.
+ */
+WRBUF zebra_replace(ZebraMaps zms, unsigned reg_id, const char *ex_list,
+                    const char *input_str, int input_len)
+{
+    struct zebra_map *zm = zebra_map_get (zms, reg_id);
+
+    wrbuf_rewind(zms->wrbuf_1);
+    wrbuf_write(zms->wrbuf_1, input_str, input_len);
+    /* no map for this register, or no rules: nothing to do */
+    if (!zm || !zm->replace_tokens)
+        return zms->wrbuf_1;
+
+#if 0
+    logf (LOG_LOG, "zebra_replace");
+    logf (LOG_LOG, "in:%.*s:", wrbuf_len(zms->wrbuf_1),
+          wrbuf_buf(zms->wrbuf_1));
+#endif
+    for (;;)
+    {
+        if (!zebra_replace_sub(zms, reg_id, ex_list, wrbuf_buf(zms->wrbuf_1),
+                               wrbuf_len(zms->wrbuf_1), zms->wrbuf_2))
+            return zms->wrbuf_2;
+        if (!zebra_replace_sub(zms, reg_id, ex_list, wrbuf_buf(zms->wrbuf_2),
+                               wrbuf_len(zms->wrbuf_2), zms->wrbuf_1))
+            return zms->wrbuf_1;
+    }
+    return 0; /* not reached */
+}
+
+/*
+ * zebra_replace_sub: one left-to-right replacement pass over input_str.
+ *
+ * At every position the rules of register reg_id are tried in list order
+ * and the first one that matches is applied.  Positions -1 and input_len
+ * act as blanks, so a rule can match against the start or end of the
+ * string.  Input characters are lower-cased before comparison.
+ *
+ * Within a rule's token_from:
+ *   '.'    matches any character except blank and is copied to the output;
+ *   other  characters must match exactly; token_to (if any) is emitted
+ *          once, when the first such character matches.
+ * A rule is abandoned as soon as it reaches a character listed in ex_list
+ * (when ex_list is not NULL), and also when its output would not fit the
+ * local replacement buffer.
+ *
+ * The result is written to wrbuf, which is rewound first.  Returns the
+ * number of replacements made in this pass.
+ */
+int zebra_replace_sub(ZebraMaps zms, unsigned reg_id, const char *ex_list,
+                      const char *input_str, int input_len, WRBUF wrbuf)
+{
+    int i;
+    int no_replaces = 0;
+    struct zebra_map *zm = zebra_map_get (zms, reg_id);
+    /* a register without a map simply has no rules */
+    struct zm_token *rules = zm ? zm->replace_tokens : 0;
+
+    wrbuf_rewind(wrbuf);
+    for (i = -1; i <= input_len; )
+    {
+        struct zm_token *token;
+        char replace_string[128];
+        int replace_out = 0;
+        int replace_in = 0;     /* input characters consumed by a match */
+
+        for (token = rules; !replace_in && token; token = token->next)
+        {
+            int j;
+            int replace_done = 0;
+            int fits = 1;       /* cleared when replace_string is full */
+
+            replace_out = 0;
+            for (j = 0; fits; j++)
+            {
+                /* compare as unsigned char so that characters above 127
+                 * match the value returned by tolower() */
+                unsigned char pat = (unsigned char) token->token_from[j];
+                int c;
+
+                if (!pat)
+                {
+                    replace_in = j;     /* whole rule matched */
+                    break;
+                }
+                if (ex_list && strchr (ex_list, pat))
+                    break;
+                if (i+j < 0 || i+j >= input_len)
+                    c = ' ';
+                else
+                    c = tolower((unsigned char) input_str[i+j]);
+                if (pat == '.')
+                {
+                    if (c == ' ')
+                        break;
+                    if (replace_out >= (int) sizeof(replace_string))
+                        fits = 0;
+                    else
+                        replace_string[replace_out++] = c;
+                }
+                else
+                {
+                    if (c != pat)
+                        break;
+                    if (!replace_done)
+                    {
+                        const char *cp = token->token_to;
+
+                        replace_done = 1;
+                        for (; cp && *cp; cp++)
+                        {
+                            if (replace_out >= (int) sizeof(replace_string))
+                            {
+                                fits = 0;
+                                break;
+                            }
+                            replace_string[replace_out++] = *cp;
+                        }
+                    }
+                }
+            }
+        }
+        if (!replace_in)
+        {
+            /* no rule applies here: copy the character through */
+            if (i >= 0 && i < input_len)
+                wrbuf_putc(wrbuf, input_str[i]);
+            i++;
+        }
+        else
+        {
+            no_replaces++;
+            if (replace_out)
+                wrbuf_write(wrbuf, replace_string, replace_out);
+            i += replace_in;
+        }
+    }
+#if 0
+    logf (LOG_LOG, "out:%.*s:", wrbuf_len(wrbuf), wrbuf_buf(wrbuf));
+#endif
+    return no_replaces;
+}