-/* $Id: mod_dom.c,v 1.16 2007-02-18 21:53:22 adam Exp $
+/* $Id: mod_dom.c,v 1.40 2007-10-21 19:39:00 adam Exp $
Copyright (C) 1995-2007
Index Data ApS
#include <stdio.h>
#include <assert.h>
#include <ctype.h>
+#include <stdarg.h>
#include <yaz/diagbib1.h>
#include <yaz/tpath.h>
+#include <yaz/snprintf.h>
#include <libxml/xmlversion.h>
#include <libxml/parser.h>
#include <idzebra/util.h>
#include <idzebra/recctrl.h>
+#include <yaz/oid_db.h>
/* DOM filter style indexing */
#define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
int type;
union {
struct {
+ xmlTextReaderPtr reader;
+ int split_level;
+ } xmlreader;
+ struct {
const char *input_charset;
yaz_marc_t handle;
yaz_iconv_t iconv;
} marc;
- struct {
- xmlTextReaderPtr reader;
- int split_level;
- } xmlreader;
} u;
struct filter_input *next;
};
struct filter_retrieve *retrieve_list;
struct filter_input *input_list;
struct filter_store *store;
+ int record_info_invoked;
};
+
+
#define XML_STRCMP(a,b) strcmp((char*)a, b)
#define XML_STRLEN(a) strlen((char*)a)
+/* Walk PTR along its ->next chain and run the statement that follows the
+   macro only for nodes whose type is XML_ELEMENT_NODE.  PTR itself is
+   advanced, so it is NULL once the loop has run to completion.
+   The inverted test with an empty branch ("if (...) {} else") means an
+   `else' written after the loop body can no longer attach to the hidden
+   `if' of the macro; it becomes a compile error instead. */
+#define FOR_EACH_ELEMENT(ptr) \
+    for (; (ptr); (ptr) = (ptr)->next) \
+        if ((ptr)->type != XML_ELEMENT_NODE) {} else
+
+static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
+                    const char *fmt, ...)
+#ifdef __GNUC__
+    __attribute__ ((format (printf, 4, 5)))
+#endif
+    ;
+
+/* Log a printf-style message through yaz_log.
+ *
+ * level  log level handed on to yaz_log
+ * tinfo  filter state; tinfo->fname is used as message prefix, or "none"
+ *        when it is not set
+ * ptr    optional node; when non-NULL its line number, as reported by
+ *        xmlGetLineNo, is added to the prefix
+ * fmt    printf-style format followed by its arguments; the formatted
+ *        text is built in a 4096 byte buffer
+ */
+static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
+                    const char *fmt, ...)
+{
+    char message[4096];
+    const char *source = tinfo->fname ? tinfo->fname : "none";
+    va_list args;
+
+    va_start(args, fmt);
+    yaz_vsnprintf(message, sizeof(message)-1, fmt, args);
+    va_end(args);
+
+    if (!ptr)
+    {
+        yaz_log(level, "%s: %s", source, message);
+        return;
+    }
+    yaz_log(level, "%s:%ld: %s", source, xmlGetLineNo(ptr), message);
+}
static void set_param_str(const char **params, const char *name,
tinfo->input_list = 0;
tinfo->store = 0;
tinfo->doc_config = 0;
+ tinfo->record_info_invoked = 0;
#if YAZ_HAVE_EXSLT
exsltRegisterAll();
struct convert_s **l)
{
*l = 0;
- for(; ptr; ptr = ptr->next)
- {
- if (ptr->type != XML_ELEMENT_NODE)
- continue;
+ FOR_EACH_ELEMENT(ptr) {
if (!XML_STRCMP(ptr->name, "xslt"))
{
struct _xmlAttr *attr;
;
else
{
- xmlChar *node_path = xmlGetNodePath(ptr);
- yaz_log(YLOG_WARN, "%s: dom filter: "
- "%s bad attribute @%s, "
- "expected @stylesheet",
- tinfo->fname,
- node_path, attr->name);
- xmlFree(node_path);
+ dom_log(YLOG_WARN, tinfo, ptr,
+ "bad attribute @%s", attr->name);
}
if (p->stylesheet)
{
NULL,
tmp_xslt_full_name))
{
- yaz_log(YLOG_WARN, "%s: dom filter: "
+ dom_log(YLOG_WARN, tinfo, 0,
"stylesheet %s not found in "
"path %s",
- tinfo->fname,
p->stylesheet,
tinfo->profile_path);
return ZEBRA_FAIL;
tmp_xslt_full_name);
if (!p->stylesheet_xsp)
{
- yaz_log(YLOG_WARN, "%s: dom filter: "
- "could not parse xslt "
- "stylesheet %s",
- tinfo->fname, tmp_xslt_full_name);
+ dom_log(YLOG_WARN, tinfo, 0,
+ "could not parse xslt stylesheet %s",
+ tmp_xslt_full_name);
return ZEBRA_FAIL;
}
- }
- else
- {
- xmlChar *node_path = xmlGetNodePath(ptr);
- yaz_log(YLOG_WARN, "%s: dom filter: "
- "%s missing attribute 'stylesheet' ",
- tinfo->fname, node_path);
- xmlFree(node_path);
- return ZEBRA_FAIL;
- }
- *l = p;
- l = &p->next;
+ }
+ else
+ {
+ dom_log(YLOG_WARN, tinfo, ptr,
+ "missing attribute 'stylesheet' ");
+ return ZEBRA_FAIL;
+ }
+ *l = p;
+ l = &p->next;
}
else
{
- xmlChar *node_path = xmlGetNodePath(ptr);
- yaz_log(YLOG_LOG,
- "%s: dom filter: "
- "%s bad node '%s'",
- tinfo->fname, node_path, ptr->name);
- xmlFree(node_path);
+ dom_log(YLOG_WARN, tinfo, ptr,
+ "bad element '%s', expected <xslt>", ptr->name);
return ZEBRA_FAIL;
}
}
}
static ZEBRA_RES perform_convert(struct filter_info *tinfo,
+ struct recExtractCtrl *extctr,
struct convert_s *convert,
const char **params,
xmlDocPtr *doc,
{
for (; convert; convert = convert->next)
{
+ xmlChar *buf_out = 0;
+ int len_out = 0;
xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
*doc, params);
if (last_xsp)
*last_xsp = convert->stylesheet_xsp;
+
+ if (!res_doc)
+ break;
+
+ /* now saving into buffer and re-reading into DOM to avoid annoying
+ XSLT problem with thrown-out indentation text nodes */
+ xsltSaveResultToString(&buf_out, &len_out, res_doc,
+ convert->stylesheet_xsp);
+ xmlFreeDoc(res_doc);
+
xmlFreeDoc(*doc);
- *doc = res_doc;
+
+ *doc = xmlParseMemory((const char *) buf_out, len_out);
+
+ /* writing debug info out */
+ if (extctr && extctr->flagShowRecords)
+ yaz_log(YLOG_LOG, "%s: XSLT %s\n %.*s",
+ tinfo->fname ? tinfo->fname : "(none)",
+ convert->stylesheet,
+ len_out, buf_out);
+
+ xmlFree(buf_out);
}
return ZEBRA_OK;
}
}
static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
- const char *syntax,
- const char *name)
+ const char *syntax, const char *name)
{
- for (; ptr; ptr = ptr->next)
- {
- if (ptr->type != XML_ELEMENT_NODE)
- continue;
+ FOR_EACH_ELEMENT(ptr) {
if (!XML_STRCMP(ptr->name, "marc"))
{
yaz_iconv_t iconv = 0;
for (attr = ptr->properties; attr; attr = attr->next)
{
- if (attr_content(attr, "charset", &input_charset))
+ if (attr_content(attr, "inputcharset", &input_charset))
;
else
{
- xmlChar *node_path = xmlGetNodePath(ptr);
- yaz_log(YLOG_WARN, "%s: dom filter: "
- "%s bad attribute @%s,"
- " expected @charset",
- tinfo->fname,
- node_path, attr->name);
- xmlFree(node_path);
+ dom_log(YLOG_WARN, tinfo, ptr,
+ "bad attribute @%s, expected @inputcharset",
+ attr->name);
}
}
iconv = yaz_iconv_open("utf-8", input_charset);
if (!iconv)
{
- xmlChar *node_path = xmlGetNodePath(ptr);
- yaz_log(YLOG_WARN, "%s: dom filter: "
- "%s unsupported @charset '%s'",
- tinfo->fname, node_path,
- input_charset);
- xmlFree(node_path);
+                /* the attribute is read as "inputcharset", so report it
+                   under that name */
+                dom_log(YLOG_WARN, tinfo, ptr,
+                        "unsupported @inputcharset '%s'", input_charset);
return ZEBRA_FAIL;
}
else
;
else
{
- xmlChar *node_path = xmlGetNodePath(ptr);
- yaz_log(YLOG_WARN, "%s: dom filter: "
- "%s bad attribute @%s,"
- " expected @level",
- tinfo->fname, node_path,
+ dom_log(YLOG_WARN, tinfo, ptr,
+ "bad attribute @%s, expected @level",
attr->name);
- xmlFree(node_path);
}
}
if (level_str)
}
else
{
- xmlChar *node_path = xmlGetNodePath(ptr);
- yaz_log(YLOG_WARN, "%s: dom filter: "
- "%s bad element <%s>,"
- " expected <marc>|<xmlreader>",
- tinfo->fname, node_path, ptr->name);
- xmlFree(node_path);
+ dom_log(YLOG_WARN, tinfo, ptr,
+ "bad element <%s>, expected <marc>|<xmlreader>",
+ ptr->name);
return ZEBRA_FAIL;
}
}
yaz_log(YLOG_LOG, "%s dom filter: "
"loading config file %s", tinfo->fname, tinfo->full_name);
-
+
doc = xmlParseFile(tinfo->full_name);
if (!doc)
{
if (!ptr || ptr->type != XML_ELEMENT_NODE
|| XML_STRCMP(ptr->name, "dom"))
{
- xmlChar *node_path = xmlGetNodePath(ptr);
- yaz_log(YLOG_WARN, "%s: dom filter: "
- "%s bad root element <%s>,"
- " expected root element <dom>",
- tinfo->fname, node_path, ptr->name);
- xmlFree(node_path);
+        /* the enclosing condition is also true when ptr is NULL, so the
+           element name must not be read unguarded; dom_log itself accepts
+           a NULL node */
+        dom_log(YLOG_WARN, tinfo, ptr,
+                "bad root element <%s>, expected root element <dom>",
+                ptr ? (const char *) ptr->name : "(none)");
return ZEBRA_FAIL;
}
- for (ptr = ptr->children; ptr; ptr = ptr->next)
- {
- if (ptr->type != XML_ELEMENT_NODE)
- continue;
+ ptr = ptr->children;
+ FOR_EACH_ELEMENT(ptr) {
if (!XML_STRCMP(ptr->name, "extract"))
{
/*
;
else
{
- xmlChar *node_path = xmlGetNodePath(ptr);
- yaz_log(YLOG_WARN, "%s: dom filter: "
- "%s bad attribute @%s"
- " expected @name",
- tinfo->fname,
- node_path, attr->name);
- xmlFree(node_path);
+ dom_log(YLOG_WARN, tinfo, ptr,
+ "bad attribute @%s, expected @name",
+ attr->name);
}
}
parse_convert(tinfo, ptr->children, &f->convert);
;
else
{
- xmlChar *node_path = xmlGetNodePath(ptr);
- yaz_log(YLOG_WARN, "%s: dom filter: "
- "%s bad attribute @%s"
- " expected @identifier|@name",
- tinfo->fname,
- node_path, attr->name);
- xmlFree(node_path);
+ dom_log(YLOG_WARN, tinfo, ptr,
+ "bad attribute @%s, expected @identifier|@name",
+ attr->name);
}
}
parse_convert(tinfo, ptr->children, &f->convert);
;
else
{
- xmlChar *node_path = xmlGetNodePath(ptr);
- yaz_log(YLOG_WARN, "%s: dom filter: "
- "%s bad attribute @%s"
- " expected @syntax|@name",
- tinfo->fname,
- node_path, attr->name);
- xmlFree(node_path);
+ dom_log(YLOG_WARN, tinfo, ptr,
+ "bad attribute @%s, expected @syntax|@name",
+ attr->name);
}
}
parse_input(tinfo, ptr->children, syntax, name);
}
else
{
- xmlChar *node_path = xmlGetNodePath(ptr);
- yaz_log(YLOG_WARN, "%s: dom filter: "
- "%s bad element <%s>,"
- " expected <extract>|<input>|<retrieve>|<store>",
- tinfo->fname, node_path, ptr->name);
- xmlFree(node_path);
+ dom_log(YLOG_WARN, tinfo, ptr,
+ "bad element <%s>, "
+ "expected <extract>|<input>|<retrieve>|<store>",
+ ptr->name);
return ZEBRA_FAIL;
}
}
+ if (!tinfo->input_list)
+ {
+ struct filter_input *p
+ = new_input(tinfo, DOM_INPUT_XMLREADER);
+ p->u.xmlreader.split_level = 0;
+ p->u.xmlreader.reader = 0;
+ }
return ZEBRA_OK;
}
/* DOM filter style indexing */
static int attr_content_xml(struct _xmlAttr *attr, const char *name,
- xmlChar **dst_content)
+ const char **dst_content)
{
if (0 == XML_STRCMP(attr->name, name) && attr->children
&& attr->children->type == XML_TEXT_NODE)
{
- *dst_content = (attr->children->content);
+ *dst_content = (const char *) (attr->children->content);
return 1;
}
return 0;
struct recExtractCtrl *extctr,
RecWord* recword,
xmlNodePtr node,
- xmlChar * index_p)
+ const char *index_p)
{
- xmlChar *text = xmlNodeGetContent(node);
- size_t text_len = strlen((const char *)text);
-
-
- /* if there is no text, we do not need to proceed */
- if (text_len)
- {
- xmlChar *look = index_p;
- xmlChar *bval;
- xmlChar *eval;
-
- xmlChar index[256];
- xmlChar type[256];
+ if (tinfo->record_info_invoked == 1)
+ {
+ xmlChar *text = xmlNodeGetContent(node);
+ size_t text_len = strlen((const char *)text);
+
+ /* if there is no text, we do not need to proceed */
+ if (text_len)
+ {
+ const char *look = index_p;
+ const char *bval;
+ const char *eval;
+
+ xmlChar index[256];
+ xmlChar type[256];
- /* assingning text to be indexed */
- recword->term_buf = (const char *)text;
- recword->term_len = text_len;
+ /* assingning text to be indexed */
+ recword->term_buf = (const char *)text;
+ recword->term_len = text_len;
- /* parsing all index name/type pairs */
- /* may not start with ' ' or ':' */
- while (*look && ' ' != *look && ':' != *look)
- {
- /* setting name and type to zero */
- *index = '\0';
- *type = '\0';
-
- /* parsing one index name */
- bval = look;
- while (*look && ':' != *look && ' ' != *look)
+ /* parsing all index name/type pairs */
+ /* may not start with ' ' or ':' */
+ while (*look && ' ' != *look && ':' != *look)
{
- look++;
- }
- eval = look;
- strncpy((char *)index, (const char *)bval, eval - bval);
- index[eval - bval] = '\0';
+ /* setting name and type to zero */
+ *index = '\0';
+ *type = '\0';
-
- /* parsing one index type, if existing */
- if (':' == *look)
- {
- look++;
-
+ /* parsing one index name */
bval = look;
- while (*look && ' ' != *look)
+ while (*look && ':' != *look && ' ' != *look)
{
look++;
}
eval = look;
- strncpy((char *)type, (const char *)bval, eval - bval);
- type[eval - bval] = '\0';
- }
+ strncpy((char *)index, (const char *)bval, eval - bval);
+ index[eval - bval] = '\0';
+
+
+ /* parsing one index type, if existing */
+ if (':' == *look)
+ {
+ look++;
+
+ bval = look;
+ while (*look && ' ' != *look)
+ {
+ look++;
+ }
+ eval = look;
+ strncpy((char *)type, (const char *)bval, eval - bval);
+ type[eval - bval] = '\0';
+ }
- /* actually indexing the text given */
- yaz_log(YLOG_DEBUG, "%s dom filter: "
- "INDEX '%s:%s' '%s'",
- tinfo->fname, index, type, text);
+ /* actually indexing the text given */
- recword->index_name = (const char *)index;
- if (type && *type)
- recword->index_type = *type;
- (extctr->tokenAdd)(recword);
+ recword->index_name = (const char *)index;
+ if (type && *type)
+ recword->index_type = *type;
- /* eat whitespaces */
- if (*look && ' ' == *look && *(look+1))
- {
- look++;
- }
+ /* writing debug out */
+ if (extctr->flagShowRecords)
+ dom_log(YLOG_LOG, tinfo, 0,
+ "INDEX '%s:%s' '%s'",
+ index ? (const char *) index : "null",
+ type ? (const char *) type : "null",
+ text ? (const char *) text : "null");
+
+ /* actually indexing the text given */
+ recword->index_name = (const char *)index;
+ if (type && *type)
+ recword->index_type = *type;
+ (extctr->tokenAdd)(recword);
+
+ /* eat whitespaces */
+ if (*look && ' ' == *look)
+ {
+ look++;
+ }
+ }
}
+ xmlFree(text);
}
-
- xmlFree(text);
}
/* DOM filter style indexing */
static void set_record_info(struct filter_info *tinfo,
struct recExtractCtrl *extctr,
- xmlChar * id_p,
- xmlChar * rank_p,
- xmlChar * type_p)
+ xmlNodePtr node,
+ const char * id_p,
+ const char * rank_p,
+ const char * type_p)
{
- yaz_log(YLOG_DEBUG, "%s dom filter: "
- "RECORD id=%s rank=%s type=%s",
- tinfo->fname, id_p, rank_p, type_p);
+ /* writing debug info out */
+ if (extctr && extctr->flagShowRecords)
+ dom_log(YLOG_LOG, tinfo, node,
+ "RECORD id=%s rank=%s type=%s",
+ id_p ? (const char *) id_p : "(null)",
+ rank_p ? (const char *) rank_p : "(null)",
+ type_p ? (const char *) type_p : "(null)");
- if (id_p)
+
+ if (id_p && *id_p)
sscanf((const char *)id_p, "%255s", extctr->match_criteria);
- if (rank_p)
+ if (rank_p && *rank_p)
extctr->staticrank = atozint((const char *)rank_p);
- /* if (!strcmp("update", type_str)) */
- /* index_node(tinfo, ctrl, ptr, recword); */
- /* else if (!strcmp("delete", type_str)) */
- /* yaz_log(YLOG_WARN, "dom filter delete: to be implemented"); */
- /* else */
- /* yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'", */
- /* type_str); */
+ if (type_p && *type_p)
+ {
+ enum zebra_recctrl_action_t action = action_update;
+ if (!strcmp(type_p, "insert"))
+ action = action_insert;
+ else if (!strcmp(type_p, "delete"))
+ action = action_delete;
+ else if (!strcmp(type_p, "replace"))
+ action = action_replace;
+ else if (!strcmp(type_p, "update"))
+ action = action_update;
+ else
+ dom_log(YLOG_WARN, tinfo, node, "bad @type value: %s", type_p);
+ extctr->action = action;
+ yaz_log(YLOG_LOG, "In mod_dom.c: setting action to %d", action);
+ }
+
+ if (tinfo->record_info_invoked == 1)
+ {
+ /* warn about multiple only once */
+ dom_log(YLOG_WARN, tinfo, node, "multiple record elements");
+ }
+ tinfo->record_info_invoked++;
}
{
if (0 == XML_STRCMP(node->name, "index"))
{
- xmlChar *index_p = 0;
+ const char *index_p = 0;
struct _xmlAttr *attr;
for (attr = node->properties; attr; attr = attr->next)
{
if (attr_content_xml(attr, "name", &index_p))
{
- index_value_of(tinfo, extctr, recword,node, index_p);
+ index_value_of(tinfo, extctr, recword, node, index_p);
}
else
{
- xmlChar *node_path = xmlGetNodePath(node);
- yaz_log(YLOG_WARN,"%s dom filter: "
- "%s bad attribute @%s, expected @name",
- tinfo->fname, node_path, attr->name);
- xmlFree(node_path);
+ dom_log(YLOG_WARN, tinfo, node,
+ "bad attribute @%s, expected @name",
+ attr->name);
}
}
}
else if (0 == XML_STRCMP(node->name, "record"))
{
- xmlChar *id_p = 0;
- xmlChar *rank_p = 0;
- xmlChar *type_p = 0;
+ const char *id_p = 0;
+ const char *rank_p = 0;
+ const char *type_p = 0;
struct _xmlAttr *attr;
for (attr = node->properties; attr; attr = attr->next)
;
else
{
- xmlChar *node_path = xmlGetNodePath(node);
- yaz_log(YLOG_WARN,"%s dom filter: "
- "%s bad attribute @%s,"
- " expected @id|@rank|@type",
- tinfo->fname, node_path, attr->name);
- xmlFree(node_path);
- }
-
- if (type_p && 0 != strcmp("update", (const char *)type_p))
- {
- xmlChar *node_path = xmlGetNodePath(node);
- yaz_log(YLOG_WARN,"%s dom filter: "
- "%s attribute @%s,"
- " only implemented '@type='update'",
- tinfo->fname, node_path, attr->name);
- xmlFree(node_path);
+ dom_log(YLOG_WARN, tinfo, node,
+ "bad attribute @%s, expected @id|@rank|@type",
+ attr->name);
}
-
-
}
- set_record_info(tinfo, extctr, id_p, rank_p, type_p);
+ set_record_info(tinfo, extctr, node, id_p, rank_p, type_p);
}
else
{
- xmlChar *node_path = xmlGetNodePath(node);
- yaz_log(YLOG_WARN,"%s dom filter: "
- "%s bad element <%s>,"
+ dom_log(YLOG_WARN, tinfo, node,
+ "bad element <%s>,"
" expected <record>|<index> in namespace '%s'",
- tinfo->fname, node_path,
node->name, zebra_dom_ns);
- xmlFree(node_path);
}
}
}
+/* Parse one "NAME=VALUE" token at the start of a blank-separated list.
+ *
+ * c_ptr      in/out position in the text.  Leading blanks are always
+ *            skipped.  On a match the position moves past the value and
+ *            the blanks after it; without a match it is left on the first
+ *            non-blank character, so the caller must handle that token.
+ * name       attribute name expected directly in front of '='
+ * value      receives the NUL-terminated value, cut to value_max-1
+ *            characters; set to "" when there is no match
+ * value_max  capacity of value in bytes; with 0 nothing is stored
+ *
+ * Returns 1 when the token was NAME=..., 0 otherwise.
+ */
+static int attr_content_pi(const char **c_ptr, const char *name,
+                           char *value, size_t value_max)
+{
+    const size_t name_len = strlen(name);
+    const char *look = *c_ptr;
+    int ret = 0;
+
+    if (value_max > 0)
+        *value = '\0';
+    while (' ' == *look)
+        look++;
+    /* strncmp stops at the terminating NUL, so look[name_len] is only
+       read when name_len characters are really there */
+    if (!strncmp(look, name, name_len) && look[name_len] == '=')
+    {
+        size_t i = 0;
+
+        look += name_len + 1;
+        while (*look && ' ' != *look)
+        {
+            /* keep one byte for the terminator; excess input is dropped */
+            if (i + 1 < value_max)
+                value[i++] = *look;
+            look++;
+        }
+        if (value_max > 0)
+            value[i] = '\0';
+        ret = 1;
+    }
+    while (' ' == *look)
+        look++;
+    *c_ptr = look;
+    return ret;
+}
/* DOM filter style indexing */
static void process_xml_pi_node(struct filter_info *tinfo,
struct recExtractCtrl *extctr,
xmlNodePtr node,
- xmlChar **index_pp)
+ const char **index_pp)
{
/* if right PI name, continue parsing PI */
if (0 == strcmp(zebra_pi_name, (const char *)node->name))
{
xmlChar *pi_p = node->content;
- xmlChar *look = pi_p;
+ const char *look = (const char *) node->content;
- xmlChar *bval;
- xmlChar *eval;
-
/* parsing PI record instructions */
if (0 == strncmp((const char *)look, "record", 6))
{
- xmlChar id[256];
- xmlChar rank[256];
- xmlChar type[256];
-
+ char id[256];
+ char rank[256];
+ char type[256];
+
*id = '\0';
*rank = '\0';
*type = '\0';
-
look += 6;
-
- /* eat whitespace */
- while (*look && ' ' == *look && *(look+1))
- look++;
-
- /* parse possible id */
- if (*look && 0 == strncmp((const char *)look, "id=", 3))
- {
- look += 3;
- bval = look;
- while (*look && ' ' != *look)
- look++;
- eval = look;
- strncpy((char *)id, (const char *)bval, eval - bval);
- id[eval - bval] = '\0';
- }
-
- /* eat whitespace */
- while (*look && ' ' == *look && *(look+1))
- look++;
-
- /* parse possible rank */
- if (*look && 0 == strncmp((const char *)look, "rank=", 5))
- {
- look += 6;
- bval = look;
- while (*look && ' ' != *look)
- look++;
- eval = look;
- strncpy((char *)rank, (const char *)bval, eval - bval);
- rank[eval - bval] = '\0';
- }
-
- /* eat whitespace */
- while (*look && ' ' == *look && *(look+1))
- look++;
-
- if (look && '\0' != *look)
- {
- xmlChar *node_path = xmlGetNodePath(node);
- yaz_log(YLOG_WARN,"%s dom filter: "
- "%s content '%s', can not parse '%s'",
- tinfo->fname, node_path, pi_p, look);
- xmlFree(node_path);
- }
- else
- set_record_info(tinfo, extctr, id, rank, 0);
-
+            /* consume NAME=VALUE tokens in any order; give up at the first
+               token that is none of id/rank/type, because attr_content_pi
+               does not move past a token it did not match and the loop
+               would otherwise never terminate */
+            while (*look)
+            {
+                if (attr_content_pi(&look, "id", id, sizeof(id)))
+                    ;
+                else if (attr_content_pi(&look, "rank", rank, sizeof(rank)))
+                    ;
+                else if (attr_content_pi(&look, "type", type, sizeof(type)))
+                    ;
+                else
+                {
+                    dom_log(YLOG_WARN, tinfo, node,
+                            "content '%s', can not parse '%s'",
+                            pi_p, look);
+                    break;
+                }
+            }
+ set_record_info(tinfo, extctr, node, id, rank, type);
}
/* parsing index instruction */
else if (0 == strncmp((const char *)look, "index", 5))
look += 5;
/* eat whitespace */
- while (*look && ' ' == *look && *(look+1))
+ while (*look && ' ' == *look)
look++;
/* export index instructions to outside */
}
else
{
- xmlChar *node_path = xmlGetNodePath(node);
- yaz_log(YLOG_WARN,"%s dom filter: "
- "%s content '%s', can not parse '%s'",
- tinfo->fname, node_path, pi_p, look);
- xmlFree(node_path);
+ dom_log(YLOG_WARN, tinfo, node,
+ "content '%s', can not parse '%s'",
+ pi_p, look);
}
}
}
xmlNodePtr node)
{
/* remember indexing instruction from PI to next element node */
- xmlChar *index_p = 0;
+ const char *index_p = 0;
/* check if we are an element node in the special zebra namespace
and either set record data or index value-of node content*/
struct recExtractCtrl *extctr,
xmlDocPtr doc)
{
- xmlChar *buf_out;
- int len_out;
-
/* only need to do the initialization once, reuse recword for all terms */
RecWord recword;
(*extctr->init)(extctr, &recword);
- if (extctr->flagShowRecords)
- {
- xmlDocDumpMemory(doc, &buf_out, &len_out);
- fwrite(buf_out, len_out, 1, stdout);
- xmlFree(buf_out);
- }
-
process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
}
xsltStylesheetPtr last_xsp = 0;
xmlDocPtr store_doc = 0;
+ /* per default do not ingest record */
+ tinfo->record_info_invoked = 0;
+
+ /* exit if empty document given */
+ if (!doc)
+ return RECCTRL_EXTRACT_SKIP;
+
+ /* we actually have a document which needs to be processed further */
params[0] = 0;
set_param_str(params, "schema", zebra_dom_ns, tinfo->odr_record);
+    if (p && p->flagShowRecords)
+    {
+        /* serialize the document first: buf_out and len_out are only
+           filled in by xmlDocDumpMemory and must not be logged before */
+        xmlChar *buf_out = 0;
+        int len_out = 0;
+
+        xmlDocDumpMemory(doc, &buf_out, &len_out);
+        if (buf_out)
+        {
+#if 0
+            /* debugging aid: also write the document to a file */
+            FILE *outf = fopen("extract.xml", "w");
+            if (outf)
+            {
+                fwrite(buf_out, 1, len_out, outf);
+                fclose(outf);
+            }
+#endif
+            yaz_log(YLOG_LOG, "Extract Doc: %.*s", len_out, buf_out);
+            /* the dump buffer belongs to us and has to be released */
+            xmlFree(buf_out);
+        }
+    }
+
/* input conversion */
- perform_convert(tinfo, input->convert, params, &doc, 0);
+ perform_convert(tinfo, p, input->convert, params, &doc, 0);
+
if (tinfo->store)
{
/* store conversion */
store_doc = xmlCopyDoc(doc, 1);
- perform_convert(tinfo, tinfo->store->convert,
+ perform_convert(tinfo, p, tinfo->store->convert,
params, &store_doc, &last_xsp);
}
+ /* saving either store doc or original doc in case no store doc exists */
if (last_xsp)
xsltSaveResultToString(&buf_out, &len_out,
store_doc ? store_doc : doc, last_xsp);
else
xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
- if (p->flagShowRecords)
- fwrite(buf_out, len_out, 1, stdout);
- (*p->setStoreData)(p, buf_out, len_out);
+
+ if (p->setStoreData)
+ (*p->setStoreData)(p, buf_out, len_out);
xmlFree(buf_out);
if (store_doc)
xmlFreeDoc(store_doc);
/* extract conversion */
- perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0);
+ perform_convert(tinfo, p, tinfo->extract->convert, params, &doc, 0);
+
/* finally, do the indexing */
- if (doc)
- {
+ if (doc){
extract_dom_doc_node(tinfo, p, doc);
- /* extract_doc_alvis(tinfo, p, doc); */
xmlFreeDoc(doc);
}
+
+ /* there was nothing to index, so there is no inserted/updated record */
+ if (tinfo->record_info_invoked == 0)
+ return RECCTRL_EXTRACT_SKIP;
return RECCTRL_EXTRACT_OK;
}
p /* I/O handler */,
0 /* URL */,
0 /* encoding */,
- XML_PARSE_XINCLUDE|
- XML_PARSE_NOENT);
+ XML_PARSE_XINCLUDE
+ | XML_PARSE_NOENT
+ | XML_PARSE_NONET);
}
if (!input->u.xmlreader.reader)
return RECCTRL_EXTRACT_ERROR_GENERIC;
{
int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
+
if (type == XML_READER_TYPE_ELEMENT &&
input->u.xmlreader.split_level == depth)
{
- xmlNodePtr ptr
- = xmlTextReaderExpand(input->u.xmlreader.reader);
+ xmlNodePtr ptr;
+
+ /* per default do not ingest record */
+ tinfo->record_info_invoked = 0;
+
+ ptr = xmlTextReaderExpand(input->u.xmlreader.reader);
if (ptr)
- {
+ {
+ /* we have a new document */
+
xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
xmlDocSetRootElement(doc, ptr2);
+ /* writing debug info out */
+ if (p->flagShowRecords)
+ {
+ xmlChar *buf_out = 0;
+ int len_out = 0;
+ xmlDocDumpMemory(doc, &buf_out, &len_out);
+ yaz_log(YLOG_LOG, "%s: XMLREADER level: %i\n%.*s",
+ tinfo->fname ? tinfo->fname : "(none)",
+ depth, len_out, buf_out);
+ xmlFree(buf_out);
+ }
+
return convert_extract_doc(tinfo, input, p, doc);
}
else
p /* I/O handler */,
0 /* URL */,
0 /* encoding */,
- XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
+ XML_PARSE_XINCLUDE
+ | XML_PARSE_NOENT
+ | XML_PARSE_NONET);
if (!doc)
{
return RECCTRL_EXTRACT_ERROR_GENERIC;
{
int i;
- yaz_log(YLOG_WARN, "%s dom filter: "
+ dom_log(YLOG_WARN, tinfo, 0,
"MARC: Skipping bad byte %d (0x%02X)",
- tinfo->fname, *buf & 0xff, *buf & 0xff);
+ *buf & 0xff, *buf & 0xff);
for (i = 0; i<4; i++)
buf[i] = buf[i+1];
record_length = atoi_n (buf, 5);
if (record_length < 25)
{
- yaz_log (YLOG_WARN, "%s dom filter: "
- "MARC record length < 25, is %d",
- tinfo->fname, record_length);
+ dom_log(YLOG_WARN, tinfo, 0,
+ "MARC record length < 25, is %d", record_length);
return RECCTRL_EXTRACT_ERROR_GENERIC;
}
read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
if (read_bytes < record_length-5)
{
- yaz_log (YLOG_WARN, "%s dom filter: "
- "Couldn't read whole MARC record",
- tinfo->fname);
+ dom_log(YLOG_WARN, tinfo, 0,
+ "couldn't read whole MARC record");
return RECCTRL_EXTRACT_ERROR_GENERIC;
}
r = yaz_marc_read_iso2709(input->u.marc.handle, buf, record_length);
if (r < record_length)
{
- yaz_log (YLOG_WARN, "%s dom filter: "
- "Parsing of MARC record failed r=%d length=%d",
- tinfo->fname, r, record_length);
+ dom_log (YLOG_WARN, tinfo, 0,
+ "parsing of MARC record failed r=%d length=%d",
+ r, record_length);
return RECCTRL_EXTRACT_ERROR_GENERIC;
}
else
{
xmlDocPtr rdoc;
xmlNode *root_ptr;
- yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0);
+ yaz_marc_write_xml(input->u.marc.handle, &root_ptr,
+ "http://www.loc.gov/MARC21/slim", 0, 0);
rdoc = xmlNewDoc((const xmlChar*) "1.0");
xmlDocSetRootElement(rdoc, root_ptr);
return convert_extract_doc(tinfo, input, p, rdoc);
if (!input)
return RECCTRL_EXTRACT_ERROR_GENERIC;
-
+
odr_reset(tinfo->odr_record);
+
+ if (p->setStoreData == 0)
+ return extract_xml_full(tinfo, input, p);
switch(input->type)
{
case DOM_INPUT_XMLREADER:
{
p->diagnostic =
YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
+ p->addinfo = odr_strdup(p->odr, esn);
return 0;
}
doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
0 /* URL */,
0 /* encoding */,
- XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
+ XML_PARSE_XINCLUDE | XML_PARSE_NOENT | XML_PARSE_NONET);
if (!doc)
{
p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
}
/* retrieve conversion */
- perform_convert(tinfo, retrieve->convert, params, &doc, &last_xsp);
+ perform_convert(tinfo, 0, retrieve->convert, params, &doc, &last_xsp);
if (!doc)
{
p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
}
- else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML)
+ else if (!p->input_format
+ || !oid_oidcmp(p->input_format, yaz_oid_recsyn_xml))
{
xmlChar *buf_out;
int len_out;
else
xmlDocDumpMemory(doc, &buf_out, &len_out);
- p->output_format = VAL_TEXT_XML;
+ p->output_format = yaz_oid_recsyn_xml;
p->rec_len = len_out;
p->rec_buf = odr_malloc(p->odr, p->rec_len);
memcpy(p->rec_buf, buf_out, p->rec_len);
xmlFree(buf_out);
}
- else if (p->output_format == VAL_SUTRS)
+ else if (!oid_oidcmp(p->output_format, yaz_oid_recsyn_sutrs))
{
xmlChar *buf_out;
int len_out;
else
xmlDocDumpMemory(doc, &buf_out, &len_out);
- p->output_format = VAL_SUTRS;
+ p->output_format = yaz_oid_recsyn_sutrs;
p->rec_len = len_out;
p->rec_buf = odr_malloc(p->odr, p->rec_len);
memcpy(p->rec_buf, buf_out, p->rec_len);