From: Adam Dickmeiss Date: Mon, 16 Dec 2002 20:27:18 +0000 (+0000) Subject: xelm X-Git-Tag: ZEBRA.1.3.4.Roel~11 X-Git-Url: http://sru.miketaylor.org.uk/cgi-bin?a=commitdiff_plain;h=b17cf6fd6cad7283033afe18fc346250ec1b2ad3;p=idzebra-moved-to-github.git xelm --- diff --git a/data1/d1_absyn.c b/data1/d1_absyn.c index 49f3e5d..8e88059 100644 --- a/data1/d1_absyn.c +++ b/data1/d1_absyn.c @@ -1,4 +1,4 @@ -/* $Id: d1_absyn.c,v 1.3 2002-12-02 16:55:14 adam Exp $ +/* $Id: d1_absyn.c,v 1.4 2002-12-16 20:27:18 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -63,6 +63,28 @@ data1_absyn *data1_absyn_search (data1_handle dh, const char *name) } return NULL; } +/* *ostrich* + We need to destroy DFAs, in xp_element (xelm) definitions + pop, 2002-12-13 +*/ + +void data1_absyn_destroy (data1_handle dh) +{ + data1_absyn_cache p = *data1_absyn_cache_get (dh); + + while (p) + { + data1_absyn *abs = p->absyn; + data1_xpelement *xpe = abs->xp_elements; + while (xpe) { + logf (LOG_DEBUG,"Destroy xp element %s",xpe->xpath_expr); + if (xpe->dfa) { dfa_delete (&xpe->dfa); } + xpe = xpe->next; + } + p = p->next; + } +} + void data1_absyn_trav (data1_handle dh, void *handle, void (*fh)(data1_handle dh, void *h, data1_absyn *a)) @@ -246,11 +268,71 @@ void fix_element_ref (data1_handle dh, data1_absyn *absyn, data1_element *e) } } } +/* *ostrich* + + New function, a bit dummy now... I've seen it in zrpn.c... We should build + more clever regexps... 
+ + //a -> ^a/.*$ + //a/b -> ^b/a/.*$ + /a -> ^a/$ + /a/b -> ^b/a/$ + / -> none + + pop, 2002-12-13 + */ + +const char * mk_xpath_regexp (data1_handle dh, char *expr) +{ + char *p = expr; + int abs = 1; + int i; + int e=0; + + static char *stack[32]; + static char res[1024]; + char *r = ""; + + if (*p != '/') { return (""); } + p++; + if (*p == '/') { abs=0; p++; } + + while (*p) { + i=0; + while (*p && !strchr("/",*p)) { i++; p++; } + stack[e] = (char *) nmem_malloc (data1_nmem_get (dh), i+1); + memcpy (stack[e], p - i, i); + stack[e][i] = 0; + e++; + if (*p) {p++;} + } + e--; p = &res[0]; i=0; + sprintf (p, "^"); p++; + while (e >= 0) { + /* !!! res size is not checked !!! */ + sprintf (p, "%s/",stack[e]); + p += strlen(stack[e]) + 1; + e--; + } + if (!abs) { sprintf (p, ".*"); p+=2; } + sprintf (p, "$"); p++; + r = nmem_strdup (data1_nmem_get (dh), res); + return (r); +} + +/* *ostrich* + + added arg xpelement... when called from xelm context, it's 1, saying + that ! means xpath, not element name as attribute name... 
+ + pop, 2002-12-13 + */ static int parse_termlists (data1_handle dh, data1_termlist ***tpp, char *p, const char *file, int lineno, - const char *element_name, data1_absyn *res) + const char *element_name, data1_absyn *res, + int xpelement) { data1_termlist **tp = *tpp; do @@ -267,19 +349,27 @@ static int parse_termlists (data1_handle dh, data1_termlist ***tpp, file, lineno, p); return -1; } - if (*attname == '!') - strcpy(attname, element_name); + *tp = (data1_termlist *) - nmem_malloc(data1_nmem_get(dh), sizeof(**tp)); + nmem_malloc(data1_nmem_get(dh), sizeof(**tp)); (*tp)->next = 0; + + if (!xpelement) { + if (*attname == '!') + strcpy(attname, element_name); + } if (!((*tp)->att = data1_getattbyname(dh, res->attset, - attname))) - { - yaz_log(LOG_WARN, - "%s:%d: Couldn't find att '%s' in attset", - file, lineno, attname); - return -1; + attname))) { + if ((!xpelement) || (*attname != '!')) { + yaz_log(LOG_WARN, + "%s:%d: Couldn't find att '%s' in attset", + file, lineno, attname); + return -1; + } else { + (*tp)->att = 0; + } } + if (r == 2 && (source = strchr(structure, ':'))) *source++ = '\0'; /* cut off structure .. 
*/ else @@ -313,6 +403,8 @@ data1_absyn *data1_read_absyn (data1_handle dh, const char *file, int file_must_exist) { data1_sub_elements *cur_elements = NULL; + data1_xpelement *cur_xpelement = NULL; + data1_absyn *res = 0; FILE *f; data1_element **ppl[D1_MAX_NESTING]; @@ -357,6 +449,7 @@ data1_absyn *data1_read_absyn (data1_handle dh, const char *file, marcp = &res->marc; res->sub_elements = NULL; res->main_elements = NULL; + res->xp_elements = NULL; while (f && (argc = readconf_line(f, &lineno, line, 512, argv, 50))) { @@ -473,7 +566,7 @@ data1_absyn *data1_read_absyn (data1_handle dh, const char *file, { assert (res->attset); - if (parse_termlists (dh, &tp, p, file, lineno, name, res)) + if (parse_termlists (dh, &tp, p, file, lineno, name, res, 0)) { fclose (f); return 0; @@ -482,6 +575,75 @@ data1_absyn *data1_read_absyn (data1_handle dh, const char *file, } new_element->name = nmem_strdup(data1_nmem_get (dh), name); } + /* *ostrich* + New code to support xelm directive + for each xelm a dfa is built. xelms are stored in res->xp_elements + + maybe we should use a simple sscanf instead of dfa? 
+ + pop, 2002-12-13 + */ + + else if (!strcmp(cmd, "xelm")) { + + int i; + char *p, *xpath_expr, *termlists; + const char *regexp; + int type, value; + struct DFA *dfa = dfa = dfa_init(); + data1_termlist **tp; + + if (argc < 3) + { + yaz_log(LOG_WARN, "%s:%d: Bad # of args to xelm", file, lineno); + continue; + } + xpath_expr = argv[1]; + termlists = argv[2]; + regexp = mk_xpath_regexp(dh, xpath_expr); + i = dfa_parse (dfa, &regexp); + if (i || *regexp) { + yaz_log(LOG_WARN, "%s:%d: Bad xpath to xelm", file, lineno); + dfa_delete (&dfa); + continue; + } + + if (!cur_xpelement) + { + cur_xpelement = (data1_xpelement *) + nmem_malloc(data1_nmem_get(dh), sizeof(*cur_xpelement)); + res->xp_elements = cur_xpelement; + } else { + cur_xpelement->next = (data1_xpelement *) + nmem_malloc(data1_nmem_get(dh), sizeof(*cur_xpelement)); + cur_xpelement = cur_xpelement->next; + } + cur_xpelement->next = NULL; + cur_xpelement->xpath_expr = nmem_strdup(data1_nmem_get (dh), + xpath_expr); + + dfa_mkstate (dfa); + cur_xpelement->dfa = dfa; + + cur_xpelement->termlists = 0; + tp = &cur_xpelement->termlists; + + /* parse termList definitions */ + p = termlists; + if (*p != '-') + { + assert (res->attset); + + if (parse_termlists (dh, &tp, p, file, lineno, + xpath_expr, res,1)) + { + fclose (f); + return 0; + } + *tp = all; /* append any ALL entries to the list */ + + } + } else if (!strcmp(cmd, "section")) { char *name; @@ -489,7 +651,7 @@ data1_absyn *data1_read_absyn (data1_handle dh, const char *file, if (argc < 2) { yaz_log(LOG_WARN, "%s:%d: Bad # of args to section", - file, lineno); + file, lineno); continue; } name = argv[1]; @@ -537,7 +699,7 @@ data1_absyn *data1_read_absyn (data1_handle dh, const char *file, file, lineno); continue; } - if (parse_termlists (dh, &tp, argv[1], file, lineno, 0, res)) + if (parse_termlists (dh, &tp, argv[1], file, lineno, 0, res, 0)) { fclose (f); return 0; diff --git a/data1/d1_handle.c b/data1/d1_handle.c index 6f03aa5..f990653 100644 --- 
a/data1/d1_handle.c +++ b/data1/d1_handle.c @@ -1,4 +1,4 @@ -/* $Id: d1_handle.c,v 1.2 2002-10-22 13:19:50 adam Exp $ +/* $Id: d1_handle.c,v 1.3 2002-12-16 20:27:18 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -87,6 +87,13 @@ void data1_destroy (data1_handle dh) { if (!dh) return; + + /* *ostrich* + We need to destroy DFAs, in xp_element (xelm) definitions + pop, 2002-12-13 + */ + data1_absyn_destroy(dh); + wrbuf_free (dh->wrbuf, 1); if (dh->tab_path) xfree (dh->tab_path); diff --git a/include/data1.h b/include/data1.h index 33958f9..8b48a06 100644 --- a/include/data1.h +++ b/include/data1.h @@ -1,4 +1,4 @@ -/* $Id: data1.h,v 1.3 2002-12-02 16:55:14 adam Exp $ +/* $Id: data1.h,v 1.4 2002-12-16 20:27:18 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -33,6 +33,7 @@ Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA #include #include #include +#include /* pop */ #define d1_isspace(c) strchr(" \r\n\t\f", c) #define d1_isdigit(c) ((c) <= '9' && (c) >= '0') @@ -187,6 +188,15 @@ typedef struct data1_sub_elements { data1_element *elements; } data1_sub_elements; +/* pop */ +typedef struct data1_xpelement +{ + char *xpath_expr; + struct DFA *dfa; + data1_termlist *termlists; + struct data1_xpelement *next; +} data1_xpelement; + typedef struct data1_xattr { char *name; char *value; @@ -209,6 +219,7 @@ typedef struct data1_absyn data1_marctab *marc; data1_sub_elements *sub_elements; data1_element *main_elements; + data1_xpelement *xp_elements; /* pop */ struct data1_systag *systags; char *encoding; int enable_xpath_indexing; diff --git a/recctrl/recgrs.c b/recctrl/recgrs.c index 28ce920..4f3ab50 100644 --- a/recctrl/recgrs.c +++ b/recctrl/recgrs.c @@ -1,4 +1,4 @@ -/* $Id: recgrs.c,v 1.70 2002-12-02 16:55:14 adam Exp $ +/* $Id: recgrs.c,v 1.71 2002-12-16 20:27:18 adam Exp $ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002 Index Data Aps @@ -123,6 +123,62 @@ static void grs_destroy(void 
*clientData) xfree (h); } +/* *ostrich* + + New function, looking for xpath "element" definitions in abs, by + tagpath, using a kind of ugly regxp search.The DFA was built while + parsing abs, so here we just go trough them and try to match + against the given tagpath. The first matching entry is returned. + + pop, 2002-12-13 + */ + +data1_termlist *xpath_termlist_by_tagpath(char *tagpath, data1_node *n) +{ + data1_absyn *abs = n->root->u.root.absyn; + data1_xpelement *xpe = abs->xp_elements; + char *pexpr = malloc(strlen(tagpath)+2); + int ok = 0; + + sprintf (pexpr, "%s\n", tagpath); + while (xpe) + { + struct DFA_state **dfaar = xpe->dfa->states; + struct DFA_state *s=dfaar[0]; + struct DFA_tran *t; + const char *p; + int i; + unsigned char c; + int start_line = 1; + + c = *pexpr++; t = s->trans; i = s->tran_no; + if (c >= t->ch[0] && c <= t->ch[1]) { + p = pexpr; + do { + if ((s = dfaar[t->to])->rule_no && + (start_line || s->rule_nno)) { + ok = 1; + break; + } + for (t=s->trans, i=s->tran_no; --i >= 0; t++) { + if ((unsigned) *p >= t->ch[0] && (unsigned) *p <= t->ch[1]) + break; + } + p++; + } while (i >= 0); + } + pexpr--; + if (ok) break; + xpe = xpe->next; + } + + if (ok) { + return xpe->termlists; + } else { + return NULL; + } +} + /* use 1 start element (tag) 2 end element @@ -131,6 +187,14 @@ static void grs_destroy(void *clientData) 1016 cdata 1015 attr data + + *ostrich* + + Now, if there is a matching xelm described in abs, for the + indexed element or the attribute, then the data is handled according + to those definitions... 
+ + modified by pop, 2002-12-13 */ static void index_xpath (data1_node *n, struct recExtractCtrl *p, @@ -144,24 +208,64 @@ static void index_xpath (data1_node *n, struct recExtractCtrl *p, switch (n->which) { case DATA1N_data: - wrd->reg_type = 'w'; wrd->string = n->u.data.data; wrd->length = n->u.data.len; - wrd->attrSet = VAL_IDXPATH, - wrd->attrUse = use; if (p->flagShowRecords) { printf("%*s data=", (level + 1) * 4, ""); for (i = 0; i<wrd->length && i < 8; i++) fputc (wrd->string[i], stdout); printf("\n"); - } - else - { - (*p->tokenAdd)(wrd); + } + else { + data1_termlist *tl; + int xpdone = 0; + flen = 0; + + /* we have to fetch the whole path to the data tag */ + for (nn = n; nn; nn = nn->parent) { + if (nn->which == DATA1N_tag) { + size_t tlen = strlen(nn->u.tag.tag); + if (tlen + flen > (sizeof(tag_path_full)-2)) return; + memcpy (tag_path_full + flen, nn->u.tag.tag, tlen); + flen += tlen; + tag_path_full[flen++] = '/'; + } + else if (nn->which == DATA1N_root) break; + } + + tag_path_full[flen] = 0; + + /* If we have a matching termlist... */ + if (tl = xpath_termlist_by_tagpath(tag_path_full, n)) { + for (; tl; tl = tl->next) { + wrd->reg_type = *tl->structure; + /* this is the ! case, so structure is for the xpath index */ + if (!tl->att) { + wrd->attrSet = VAL_IDXPATH; + wrd->attrUse = use; + (*p->tokenAdd)(wrd); + xpdone = 1; + /* this is just the old fashioned attribute based index */ + } else { + wrd->attrSet = (int) (tl->att->parent->reference); + wrd->attrUse = tl->att->locals->local; + (*p->tokenAdd)(wrd); + } + } + } + /* xpath indexing is done, if there was no termlist given, + or no ! attribute... 
*/ + if (!xpdone) { + wrd->attrSet = VAL_IDXPATH; + wrd->attrUse = use; + wrd->reg_type = 'w'; + (*p->tokenAdd)(wrd); + } } break; case DATA1N_tag: + flen = 0; for (nn = n; nn; nn = nn->parent) { if (nn->which == DATA1N_tag) @@ -176,6 +280,8 @@ static void index_xpath (data1_node *n, struct recExtractCtrl *p, else if (nn->which == DATA1N_root) break; } + + wrd->reg_type = '0'; wrd->string = tag_path_full; wrd->length = flen; @@ -232,7 +338,6 @@ static void index_xpath (data1_node *n, struct recExtractCtrl *p, sprintf (attr_tag_path_full, "@%s/%.*s", xp->name, int_len, tag_path_full); - wrd->reg_type = '0'; wrd->attrUse = 1; wrd->string = attr_tag_path_full; @@ -241,13 +346,40 @@ static void index_xpath (data1_node *n, struct recExtractCtrl *p, if (xp->value) { - wrd->attrUse = 1015; - wrd->reg_type = 'w'; + /* the same jokes, as with the data nodes ... */ + data1_termlist *tl; + int xpdone = 0; + wrd->string = xp->value; wrd->length = strlen(xp->value); - (*p->tokenAdd)(wrd); + wrd->reg_type = 'w'; + + if (tl = xpath_termlist_by_tagpath(attr_tag_path_full, + n)) { + for (; tl; tl = tl->next) { + wrd->reg_type = *tl->structure; + if (!tl->att) { + wrd->attrSet = VAL_IDXPATH; + wrd->attrUse = 1015; + (*p->tokenAdd)(wrd); + xpdone = 1; + } else { + wrd->attrSet = (int) (tl->att->parent->reference); + wrd->attrUse = tl->att->locals->local; + (*p->tokenAdd)(wrd); + } + } + + } + if (!xpdone) { + wrd->attrSet = VAL_IDXPATH; + wrd->attrUse = 1015; + wrd->reg_type = 'w'; + (*p->tokenAdd)(wrd); + } } + wrd->attrSet = VAL_IDXPATH; wrd->reg_type = '0'; wrd->attrUse = 2; wrd->string = attr_tag_path_full; @@ -264,6 +396,7 @@ static void index_termlist (data1_node *par, data1_node *n, { data1_termlist *tlist = 0; data1_datatype dtype = DATA1K_string; + /* * cycle up towards the root until we find a tag with an att.. 
* this has the effect of indexing locally defined tags with @@ -271,19 +404,20 @@ static void index_termlist (data1_node *par, data1_node *n, */ while (!par->u.tag.element) - if (!par->parent || !(par=get_parent_tag(p->dh, par->parent))) - break; + if (!par->parent || !(par=get_parent_tag(p->dh, par->parent))) + break; if (!par || !(tlist = par->u.tag.element->termlists)) - return; + return; if (par->u.tag.element->tag) - dtype = par->u.tag.element->tag->kind; + dtype = par->u.tag.element->tag->kind; for (; tlist; tlist = tlist->next) { + char xattr[512]; /* consider source */ wrd->string = 0; - + if (!strcmp (tlist->source, "data") && n->which == DATA1N_data) { wrd->string = n->u.data.data;