-/* $Id: d1_absyn.c,v 1.3 2002-12-02 16:55:14 adam Exp $
+/* $Id: d1_absyn.c,v 1.4 2002-12-16 20:27:18 adam Exp $
Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
Index Data Aps
}
return NULL;
}
+/* *ostrich*
+   Release the DFA attached to each xp_element (xelm) definition of
+   every abstract syntax found in the list returned by
+   data1_absyn_cache_get(). Only the DFAs are deleted here.
+   pop, 2002-12-13
+*/
+
+void data1_absyn_destroy (data1_handle dh)
+{
+    data1_absyn_cache entry;
+
+    /* walk every cached abstract syntax ... */
+    for (entry = *data1_absyn_cache_get (dh); entry; entry = entry->next)
+    {
+        data1_xpelement *elem;
+
+        /* ... and every xelm definition it carries */
+        for (elem = entry->absyn->xp_elements; elem; elem = elem->next)
+        {
+            logf (LOG_DEBUG,"Destroy xp element %s",elem->xpath_expr);
+            if (elem->dfa)
+                dfa_delete (&elem->dfa);
+        }
+    }
+}
+
void data1_absyn_trav (data1_handle dh, void *handle,
void (*fh)(data1_handle dh, void *h, data1_absyn *a))
}
}
}
+/* *ostrich*
+
+   Build the regular expression that a tag path is matched against for
+   an xelm xpath expression (idea seen in zrpn.c). Still a bit dummy;
+   we should build more clever regexps...
+
+   The location steps are emitted in reverse order (last step first),
+   each one followed by '/':
+
+   //a    ->  ^a/.*$
+   //a/b  ->  ^b/a/.*$
+   /a     ->  ^a/$
+   /a/b   ->  ^b/a/$
+   /      ->  ^$
+
+   A single trailing '/' does not start a new step; empty steps in the
+   middle of the expression are kept as they are.
+
+   dh    handle whose NMEM the result is allocated from
+   expr  xpath expression; must start with '/'
+
+   Returns "" when expr is NULL or does not start with '/', otherwise a
+   NUL-terminated regexp allocated with nmem_malloc(). The result is
+   sized from the input, so neither the number of steps nor the length
+   of the expression is limited.
+
+   pop, 2002-12-13
+ */
+
+const char * mk_xpath_regexp (data1_handle dh, char *expr)
+{
+    const char *body;       /* text following the leading "/" or "//" */
+    const char *seg_end;    /* one past the end of the current step */
+    char *res;
+    char *out;
+    size_t len;
+    int anchored = 1;       /* 0 when expr starts with "//" */
+
+    if (!expr || *expr != '/')
+        return ("");
+    body = expr + 1;
+    if (*body == '/')
+    {
+        anchored = 0;
+        body++;
+    }
+    len = strlen (body);
+
+    /* Worst case: '^' + all of body + one extra '/' + ".*" + '$' + NUL */
+    res = (char *) nmem_malloc (data1_nmem_get (dh), len + 6);
+    out = res;
+    *out++ = '^';
+
+    if (len > 0)
+    {
+        seg_end = body + len;
+        if (seg_end[-1] == '/')
+            seg_end--;      /* a trailing '/' only terminates the last step */
+        for (;;)
+        {
+            /* scan backwards to the start of the current step */
+            const char *seg = seg_end;
+
+            while (seg > body && seg[-1] != '/')
+                seg--;
+            memcpy (out, seg, (size_t) (seg_end - seg));
+            out += seg_end - seg;
+            *out++ = '/';
+            if (seg == body)
+                break;
+            seg_end = seg - 1;  /* step over the separating '/' */
+        }
+    }
+    if (!anchored)
+    {
+        *out++ = '.';
+        *out++ = '*';
+    }
+    *out++ = '$';
+    *out = '\0';
+    return (res);
+}
+
+/* *ostrich*
+
+ Added argument xpelement. It is 1 when called from an xelm context,
+ meaning that '!' refers to the xpath index instead of being replaced
+ by the element name as the attribute name.
+
+ pop, 2002-12-13
+ */
static int parse_termlists (data1_handle dh, data1_termlist ***tpp,
char *p, const char *file, int lineno,
- const char *element_name, data1_absyn *res)
+ const char *element_name, data1_absyn *res,
+ int xpelement)
{
data1_termlist **tp = *tpp;
do
file, lineno, p);
return -1;
}
- if (*attname == '!')
- strcpy(attname, element_name);
+
*tp = (data1_termlist *)
- nmem_malloc(data1_nmem_get(dh), sizeof(**tp));
+ nmem_malloc(data1_nmem_get(dh), sizeof(**tp));
(*tp)->next = 0;
+
+ if (!xpelement) {
+ if (*attname == '!')
+ strcpy(attname, element_name);
+ }
if (!((*tp)->att = data1_getattbyname(dh, res->attset,
- attname)))
- {
- yaz_log(LOG_WARN,
- "%s:%d: Couldn't find att '%s' in attset",
- file, lineno, attname);
- return -1;
+ attname))) {
+ if ((!xpelement) || (*attname != '!')) {
+ yaz_log(LOG_WARN,
+ "%s:%d: Couldn't find att '%s' in attset",
+ file, lineno, attname);
+ return -1;
+ } else {
+ (*tp)->att = 0;
+ }
}
+
if (r == 2 && (source = strchr(structure, ':')))
*source++ = '\0'; /* cut off structure .. */
else
int file_must_exist)
{
data1_sub_elements *cur_elements = NULL;
+ data1_xpelement *cur_xpelement = NULL;
+
data1_absyn *res = 0;
FILE *f;
data1_element **ppl[D1_MAX_NESTING];
marcp = &res->marc;
res->sub_elements = NULL;
res->main_elements = NULL;
+ res->xp_elements = NULL;
while (f && (argc = readconf_line(f, &lineno, line, 512, argv, 50)))
{
{
assert (res->attset);
- if (parse_termlists (dh, &tp, p, file, lineno, name, res))
+ if (parse_termlists (dh, &tp, p, file, lineno, name, res, 0))
{
fclose (f);
return 0;
}
new_element->name = nmem_strdup(data1_nmem_get (dh), name);
}
+ /* *ostrich*
+ New code to support xelm directive
+ for each xelm a dfa is built. xelms are stored in res->xp_elements
+
+ maybe we should use a simple sscanf instead of dfa?
+
+ pop, 2002-12-13
+ */
+
+ else if (!strcmp(cmd, "xelm")) {
+
+ int i;
+ char *p, *xpath_expr, *termlists;
+ const char *regexp;
+ int type, value;
+ struct DFA *dfa = dfa = dfa_init();
+ data1_termlist **tp;
+
+ if (argc < 3)
+ {
+ yaz_log(LOG_WARN, "%s:%d: Bad # of args to xelm", file, lineno);
+ continue;
+ }
+ xpath_expr = argv[1];
+ termlists = argv[2];
+ regexp = mk_xpath_regexp(dh, xpath_expr);
+ i = dfa_parse (dfa, &regexp);
+ if (i || *regexp) {
+ yaz_log(LOG_WARN, "%s:%d: Bad xpath to xelm", file, lineno);
+ dfa_delete (&dfa);
+ continue;
+ }
+
+ if (!cur_xpelement)
+ {
+ cur_xpelement = (data1_xpelement *)
+ nmem_malloc(data1_nmem_get(dh), sizeof(*cur_xpelement));
+ res->xp_elements = cur_xpelement;
+ } else {
+ cur_xpelement->next = (data1_xpelement *)
+ nmem_malloc(data1_nmem_get(dh), sizeof(*cur_xpelement));
+ cur_xpelement = cur_xpelement->next;
+ }
+ cur_xpelement->next = NULL;
+ cur_xpelement->xpath_expr = nmem_strdup(data1_nmem_get (dh),
+ xpath_expr);
+
+ dfa_mkstate (dfa);
+ cur_xpelement->dfa = dfa;
+
+ cur_xpelement->termlists = 0;
+ tp = &cur_xpelement->termlists;
+
+ /* parse termList definitions */
+ p = termlists;
+ if (*p != '-')
+ {
+ assert (res->attset);
+
+ if (parse_termlists (dh, &tp, p, file, lineno,
+ xpath_expr, res,1))
+ {
+ fclose (f);
+ return 0;
+ }
+ *tp = all; /* append any ALL entries to the list */
+
+ }
+ }
else if (!strcmp(cmd, "section"))
{
char *name;
if (argc < 2)
{
yaz_log(LOG_WARN, "%s:%d: Bad # of args to section",
- file, lineno);
+ file, lineno);
continue;
}
name = argv[1];
file, lineno);
continue;
}
- if (parse_termlists (dh, &tp, argv[1], file, lineno, 0, res))
+ if (parse_termlists (dh, &tp, argv[1], file, lineno, 0, res, 0))
{
fclose (f);
return 0;
-/* $Id: d1_handle.c,v 1.2 2002-10-22 13:19:50 adam Exp $
+/* $Id: d1_handle.c,v 1.3 2002-12-16 20:27:18 adam Exp $
Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
Index Data Aps
{
if (!dh)
return;
+
+ /* *ostrich*
+ We need to destroy DFAs, in xp_element (xelm) definitions
+ pop, 2002-12-13
+ */
+ data1_absyn_destroy(dh);
+
wrbuf_free (dh->wrbuf, 1);
if (dh->tab_path)
xfree (dh->tab_path);
-/* $Id: data1.h,v 1.3 2002-12-02 16:55:14 adam Exp $
+/* $Id: data1.h,v 1.4 2002-12-16 20:27:18 adam Exp $
Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
Index Data Aps
#include <d1_map.h>
#include <yaz/yaz-util.h>
#include <yaz/wrbuf.h>
+#include <dfa.h> /* pop */
#define d1_isspace(c) strchr(" \r\n\t\f", c)
#define d1_isdigit(c) ((c) <= '9' && (c) >= '0')
data1_element *elements;
} data1_sub_elements;
+/* pop: one "xelm" definition read from an abstract syntax file.
+ * Definitions are chained through `next`. */
+typedef struct data1_xpelement
+{
+ char *xpath_expr; /* xpath expression text as given on the xelm line */
+ struct DFA *dfa; /* DFA built from the regexp derived from xpath_expr */
+ data1_termlist *termlists; /* term lists used when the DFA matches; may be 0 */
+ struct data1_xpelement *next; /* next definition, NULL at the end of the list */
+} data1_xpelement;
+
typedef struct data1_xattr {
char *name;
char *value;
data1_marctab *marc;
data1_sub_elements *sub_elements;
data1_element *main_elements;
+ data1_xpelement *xp_elements; /* pop */
struct data1_systag *systags;
char *encoding;
int enable_xpath_indexing;
-/* $Id: recgrs.c,v 1.70 2002-12-02 16:55:14 adam Exp $
+/* $Id: recgrs.c,v 1.71 2002-12-16 20:27:18 adam Exp $
Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
Index Data Aps
xfree (h);
}
+/* *ostrich*
+
+   Look up the xpath "element" (xelm) definitions of the abstract
+   syntax by tag path, using a rather crude regexp search. The DFAs
+   were built while the abs file was parsed, so here we just go
+   through them and try to match each one against the given tag path.
+   The term lists of the first matching entry are returned.
+
+   pop, 2002-12-13
+ */
+
+/* Run one xelm DFA over text (the tag path followed by '\n').
+   Returns 1 as soon as a state with a rule is reached, 0 otherwise.
+   As before, only the first transition of the start state is tried
+   for the first character. Bytes are compared as unsigned char so
+   that values >= 0x80 are matched correctly, and the scan never steps
+   past the terminating NUL. */
+static int xpath_dfa_match (struct DFA *dfa, const unsigned char *text)
+{
+    struct DFA_state **states = dfa->states;
+    struct DFA_state *s = states[0];
+    struct DFA_tran *t = s->trans;
+    const unsigned char *p = text;
+    int i;
+
+    if (s->tran_no <= 0)
+        return 0;               /* start state has nothing to follow */
+    if (*p < t->ch[0] || *p > t->ch[1])
+        return 0;
+    p++;
+    for (;;)
+    {
+        s = states[t->to];
+        if (s->rule_no)
+            return 1;
+        if (*p == '\0')
+            return 0;           /* input exhausted without a match */
+        for (t = s->trans, i = s->tran_no; --i >= 0; t++)
+        {
+            if (*p >= t->ch[0] && *p <= t->ch[1])
+                break;
+        }
+        if (i < 0)
+            return 0;           /* no transition for this character */
+        p++;
+    }
+}
+
+/* tagpath  tag path to look up (not modified)
+   n        node whose root supplies the abstract syntax
+
+   Returns the termlists member of the first xelm definition whose DFA
+   matches (this may itself be NULL), or NULL when nothing matches or
+   the temporary buffer cannot be allocated. */
+data1_termlist *xpath_termlist_by_tagpath(char *tagpath, data1_node *n)
+{
+    data1_absyn *abs = n->root->u.root.absyn;
+    data1_xpelement *xpe;
+    data1_termlist *result = NULL;
+    size_t len = strlen (tagpath);
+    unsigned char *pexpr = (unsigned char *) malloc (len + 2);
+
+    if (!pexpr)
+        return NULL;
+
+    /* the DFAs are matched against the tag path terminated by '\n' */
+    memcpy (pexpr, tagpath, len);
+    pexpr[len] = '\n';
+    pexpr[len + 1] = '\0';
+
+    for (xpe = abs->xp_elements; xpe; xpe = xpe->next)
+    {
+        if (xpath_dfa_match (xpe->dfa, pexpr))
+        {
+            result = xpe->termlists;
+            break;              /* first match wins */
+        }
+    }
+
+    free (pexpr);               /* was leaked on every call before */
+    return result;
+}
+
/* use
1 start element (tag)
2 end element
1016 cdata
1015 attr data
+
+ *ostrich*
+
+ Now, if there is a matching xelm described in abs, for the
+ indexed element or the attribute, then the data is handled according
+ to those definitions...
+
+ modified by pop, 2002-12-13
*/
static void index_xpath (data1_node *n, struct recExtractCtrl *p,
switch (n->which)
{
case DATA1N_data:
- wrd->reg_type = 'w';
wrd->string = n->u.data.data;
wrd->length = n->u.data.len;
- wrd->attrSet = VAL_IDXPATH,
- wrd->attrUse = use;
if (p->flagShowRecords)
{
printf("%*s data=", (level + 1) * 4, "");
for (i = 0; i<wrd->length && i < 8; i++)
fputc (wrd->string[i], stdout);
printf("\n");
- }
- else
- {
- (*p->tokenAdd)(wrd);
+ }
+ else {
+ data1_termlist *tl;
+ int xpdone = 0;
+ flen = 0;
+
+ /* we have to fetch the whole path to the data tag */
+ for (nn = n; nn; nn = nn->parent) {
+ if (nn->which == DATA1N_tag) {
+ size_t tlen = strlen(nn->u.tag.tag);
+ if (tlen + flen > (sizeof(tag_path_full)-2)) return;
+ memcpy (tag_path_full + flen, nn->u.tag.tag, tlen);
+ flen += tlen;
+ tag_path_full[flen++] = '/';
+ }
+ else if (nn->which == DATA1N_root) break;
+ }
+
+ tag_path_full[flen] = 0;
+
+ /* If we have a matching termlist... */
+ if (tl = xpath_termlist_by_tagpath(tag_path_full, n)) {
+ for (; tl; tl = tl->next) {
+ wrd->reg_type = *tl->structure;
+ /* this is the ! case, so structure is for the xpath index */
+ if (!tl->att) {
+ wrd->attrSet = VAL_IDXPATH;
+ wrd->attrUse = use;
+ (*p->tokenAdd)(wrd);
+ xpdone = 1;
+ /* this is just the old fashioned attribute based index */
+ } else {
+ wrd->attrSet = (int) (tl->att->parent->reference);
+ wrd->attrUse = tl->att->locals->local;
+ (*p->tokenAdd)(wrd);
+ }
+ }
+ }
+ /* xpath indexing is done, if there was no termlist given,
+ or no ! attribute... */
+ if (!xpdone) {
+ wrd->attrSet = VAL_IDXPATH;
+ wrd->attrUse = use;
+ wrd->reg_type = 'w';
+ (*p->tokenAdd)(wrd);
+ }
}
break;
case DATA1N_tag:
+ flen = 0;
for (nn = n; nn; nn = nn->parent)
{
if (nn->which == DATA1N_tag)
else if (nn->which == DATA1N_root)
break;
}
+
+
wrd->reg_type = '0';
wrd->string = tag_path_full;
wrd->length = flen;
sprintf (attr_tag_path_full, "@%s/%.*s",
xp->name, int_len, tag_path_full);
-
wrd->reg_type = '0';
wrd->attrUse = 1;
wrd->string = attr_tag_path_full;
if (xp->value)
{
- wrd->attrUse = 1015;
- wrd->reg_type = 'w';
+ /* the same jokes, as with the data nodes ... */
+ data1_termlist *tl;
+ int xpdone = 0;
+
wrd->string = xp->value;
wrd->length = strlen(xp->value);
- (*p->tokenAdd)(wrd);
+ wrd->reg_type = 'w';
+
+ if (tl = xpath_termlist_by_tagpath(attr_tag_path_full,
+ n)) {
+ for (; tl; tl = tl->next) {
+ wrd->reg_type = *tl->structure;
+ if (!tl->att) {
+ wrd->attrSet = VAL_IDXPATH;
+ wrd->attrUse = 1015;
+ (*p->tokenAdd)(wrd);
+ xpdone = 1;
+ } else {
+ wrd->attrSet = (int) (tl->att->parent->reference);
+ wrd->attrUse = tl->att->locals->local;
+ (*p->tokenAdd)(wrd);
+ }
+ }
+
+ }
+ if (!xpdone) {
+ wrd->attrSet = VAL_IDXPATH;
+ wrd->attrUse = 1015;
+ wrd->reg_type = 'w';
+ (*p->tokenAdd)(wrd);
+ }
}
+ wrd->attrSet = VAL_IDXPATH;
wrd->reg_type = '0';
wrd->attrUse = 2;
wrd->string = attr_tag_path_full;
{
data1_termlist *tlist = 0;
data1_datatype dtype = DATA1K_string;
+
/*
* cycle up towards the root until we find a tag with an att..
* this has the effect of indexing locally defined tags with
*/
while (!par->u.tag.element)
- if (!par->parent || !(par=get_parent_tag(p->dh, par->parent)))
- break;
+ if (!par->parent || !(par=get_parent_tag(p->dh, par->parent)))
+ break;
if (!par || !(tlist = par->u.tag.element->termlists))
- return;
+ return;
if (par->u.tag.element->tag)
- dtype = par->u.tag.element->tag->kind;
+ dtype = par->u.tag.element->tag->kind;
for (; tlist; tlist = tlist->next)
{
+
char xattr[512];
/* consider source */
wrd->string = 0;
-
+
if (!strcmp (tlist->source, "data") && n->which == DATA1N_data)
{
wrd->string = n->u.data.data;