First indexing using index_types system (ICU).

[idzebra-moved-to-github.git] / index / mod_dom.c
diff --git a/index/mod_dom.c b/index/mod_dom.c

index 4457c95..4d8b02d 100644 (file)
--- a/index/mod_dom.c
+++ b/index/mod_dom.c
@@ -1,4 +1,4 @@
-/* $Id: mod_dom.c,v 1.16 2007-02-18 21:53:22 adam Exp $
+/* $Id: mod_dom.c,v 1.40 2007-10-21 19:39:00 adam Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
@@ -23,9 +23,11 @@
  #include <stdio.h>
  #include <assert.h>
  #include <ctype.h>
+#include <stdarg.h>
  
  #include <yaz/diagbib1.h>
  #include <yaz/tpath.h>
+#include <yaz/snprintf.h>
  
  #include <libxml/xmlversion.h>
  #include <libxml/parser.h>
@@ -41,6 +43,7 @@
  
  #include <idzebra/util.h>
  #include <idzebra/recctrl.h>
+#include <yaz/oid_db.h>
  
  /* DOM filter style indexing */
  #define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
@@ -83,14 +86,14 @@ struct filter_input {
      int type;
      union {
          struct {
+            xmlTextReaderPtr reader;
+            int split_level;
+        } xmlreader;
+        struct {
              const char *input_charset;
              yaz_marc_t handle;
              yaz_iconv_t iconv;
          } marc;
-        struct {
-            xmlTextReaderPtr reader;
-            int split_level;
-        } xmlreader;
      } u;
      struct filter_input *next;
  };
@@ -106,12 +109,43 @@ struct filter_info {
      struct filter_retrieve *retrieve_list;
      struct filter_input *input_list;
      struct filter_store *store;
+    int record_info_invoked;
  };
  
+
+
  #define XML_STRCMP(a,b)   strcmp((char*)a, b)
  #define XML_STRLEN(a) strlen((char*)a)
  
  
+#define FOR_EACH_ELEMENT(ptr) for (; ptr; ptr = ptr->next) if (ptr->type == XML_ELEMENT_NODE)
+
+static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
+                    const char *fmt, ...)
+#ifdef __GNUC__
+    __attribute__ ((format (printf, 4, 5)))
+#endif
+    ;
+
+static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
+                    const char *fmt, ...)
+{
+    va_list ap;
+    char buf[4096];
+
+    va_start(ap, fmt);
+    yaz_vsnprintf(buf, sizeof(buf)-1, fmt, ap);
+    if (ptr)
+    {
+        yaz_log(level, "%s:%ld: %s", tinfo->fname ? tinfo->fname : "none", 
+                xmlGetLineNo(ptr), buf);
+    }
+    else
+    {
+        yaz_log(level, "%s: %s", tinfo->fname ? tinfo->fname : "none", buf);
+    }
+    va_end(ap);
+}
  
  
  static void set_param_str(const char **params, const char *name,
@@ -151,6 +185,7 @@ static void *filter_init(Res res, RecType recType)
      tinfo->input_list = 0;
      tinfo->store = 0;
      tinfo->doc_config = 0;
+    tinfo->record_info_invoked = 0;
  
  #if YAZ_HAVE_EXSLT
      exsltRegisterAll(); 
@@ -233,10 +268,7 @@ static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
                                 struct convert_s **l)
  {
      *l = 0;
-    for(; ptr; ptr = ptr->next)
-    {
-        if (ptr->type != XML_ELEMENT_NODE)
-            continue;
+    FOR_EACH_ELEMENT(ptr) {
          if (!XML_STRCMP(ptr->name, "xslt"))
          {
              struct _xmlAttr *attr;
@@ -252,13 +284,8 @@ static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
                      ;
                  else
                  {
-                    xmlChar *node_path = xmlGetNodePath(ptr);
-                    yaz_log(YLOG_WARN, "%s: dom filter: "
-                            "%s bad attribute @%s, "
-                            "expected @stylesheet",
-                            tinfo->fname, 
-                            node_path, attr->name);
-                    xmlFree(node_path);
+                    dom_log(YLOG_WARN, tinfo, ptr,
+                            "bad attribute @%s", attr->name);
                  }
              if (p->stylesheet)
              {
@@ -268,10 +295,9 @@ static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
                                            NULL, 
                                            tmp_xslt_full_name))
                  {
-                    yaz_log(YLOG_WARN, "%s: dom filter: "
+                    dom_log(YLOG_WARN, tinfo, 0,
                              "stylesheet %s not found in "
                              "path %s",
-                            tinfo->fname,
                              p->stylesheet, 
                              tinfo->profile_path);
                      return ZEBRA_FAIL;
@@ -282,33 +308,25 @@ static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
                                                tmp_xslt_full_name);
                  if (!p->stylesheet_xsp)
                  {
-                    yaz_log(YLOG_WARN, "%s: dom filter: "
-                            "could not parse xslt "
-                            "stylesheet %s",
-                            tinfo->fname, tmp_xslt_full_name);
+                    dom_log(YLOG_WARN, tinfo, 0,
+                            "could not parse xslt stylesheet %s",
+                            tmp_xslt_full_name);
                      return ZEBRA_FAIL;
                  }
-            }
-            else
-            {
-                xmlChar *node_path = xmlGetNodePath(ptr);
-                yaz_log(YLOG_WARN, "%s: dom filter: "
-                        "%s missing attribute 'stylesheet' ", 
-                        tinfo->fname, node_path);
-                xmlFree(node_path);
-                return ZEBRA_FAIL;
-            }
-            *l = p;
-            l = &p->next;
+                }
+                else
+                {
+                    dom_log(YLOG_WARN, tinfo, ptr,
+                            "missing attribute 'stylesheet' ");
+                    return ZEBRA_FAIL;
+                }
+                *l = p;
+                l = &p->next;
          }
          else
          {
-            xmlChar *node_path = xmlGetNodePath(ptr);
-            yaz_log(YLOG_LOG, 
-                    "%s: dom filter: "
-                    "%s bad node '%s'",
-                    tinfo->fname, node_path, ptr->name);
-            xmlFree(node_path);
+            dom_log(YLOG_WARN, tinfo, ptr,
+                    "bad element '%s', expected <xslt>", ptr->name);
              return ZEBRA_FAIL;
          }
      }
@@ -316,6 +334,7 @@ static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
  }
  
  static ZEBRA_RES perform_convert(struct filter_info *tinfo, 
+                                 struct recExtractCtrl *extctr,
                                   struct convert_s *convert,
                                   const char **params,
                                   xmlDocPtr *doc,
@@ -323,12 +342,34 @@ static ZEBRA_RES perform_convert(struct filter_info *tinfo,
  {
      for (; convert; convert = convert->next)
      {
+        xmlChar *buf_out = 0;
+        int len_out = 0;
          xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
                                                  *doc, params);
          if (last_xsp)
              *last_xsp = convert->stylesheet_xsp;
+        
+        if (!res_doc)
+            break;
+
+        /* now saving into buffer and re-reading into DOM to avoid annoying
+           XSLT problem with thrown-out indentation text nodes */
+        xsltSaveResultToString(&buf_out, &len_out, res_doc,
+                               convert->stylesheet_xsp); 
+        xmlFreeDoc(res_doc);
+
          xmlFreeDoc(*doc);
-        *doc = res_doc;
+
+        *doc = xmlParseMemory((const char *) buf_out, len_out);
+
+        /* writing debug info out */
+        if (extctr && extctr->flagShowRecords)
+            yaz_log(YLOG_LOG, "%s: XSLT %s\n %.*s", 
+                    tinfo->fname ? tinfo->fname : "(none)", 
+                    convert->stylesheet,
+                    len_out, buf_out);
+        
+        xmlFree(buf_out);
      }
      return ZEBRA_OK;
  }
@@ -349,13 +390,9 @@ static struct filter_input *new_input(struct filter_info *tinfo, int type)
  }
  
  static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
-                             const char *syntax,
-                             const char *name)
+                             const char *syntax, const char *name)
  {
-    for (; ptr; ptr = ptr->next)
-    {
-        if (ptr->type != XML_ELEMENT_NODE)
-            continue;
+    FOR_EACH_ELEMENT(ptr) {
          if (!XML_STRCMP(ptr->name, "marc"))
          {
              yaz_iconv_t iconv = 0;
@@ -364,28 +401,20 @@ static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
              
              for (attr = ptr->properties; attr; attr = attr->next)
              {
-                if (attr_content(attr, "charset", &input_charset))
+                if (attr_content(attr, "inputcharset", &input_charset))
                      ;
                  else
                  {
-                    xmlChar *node_path = xmlGetNodePath(ptr);
-                    yaz_log(YLOG_WARN, "%s: dom filter: "
-                            "%s bad attribute @%s,"
-                            " expected @charset",
-                            tinfo->fname, 
-                            node_path, attr->name);
-                    xmlFree(node_path);
+                    dom_log(YLOG_WARN, tinfo, ptr,
+                            "bad attribute @%s, expected @inputcharset",
+                            attr->name);
                  }
              }
              iconv = yaz_iconv_open("utf-8", input_charset);
              if (!iconv)
              {
-                xmlChar *node_path = xmlGetNodePath(ptr);
-                yaz_log(YLOG_WARN, "%s: dom filter: "
-                        "%s unsupported @charset '%s'", 
-                        tinfo->fname, node_path,
-                        input_charset);
-                xmlFree(node_path);
+                dom_log(YLOG_WARN, tinfo, ptr, 
+                        "unsupported @charset '%s'", input_charset);
                  return ZEBRA_FAIL;
              }
              else
@@ -420,13 +449,9 @@ static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
                      ;
                  else
                  {
-                    xmlChar *node_path = xmlGetNodePath(ptr);
-                    yaz_log(YLOG_WARN, "%s: dom filter: "
-                            "%s bad attribute @%s,"
-                            " expected @level",
-                            tinfo->fname, node_path,
+                    dom_log(YLOG_WARN, tinfo, ptr,
+                            "bad attribute @%s, expected @level",
                              attr->name);
-                    xmlFree(node_path);
                  }
              }
              if (level_str)
@@ -439,12 +464,9 @@ static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
          }
          else
          {
-            xmlChar *node_path = xmlGetNodePath(ptr);
-            yaz_log(YLOG_WARN, "%s: dom filter: "
-                    "%s bad element <%s>,"
-                    " expected <marc>|<xmlreader>",
-                    tinfo->fname, node_path, ptr->name);
-            xmlFree(node_path);
+            dom_log(YLOG_WARN, tinfo, ptr,
+                    "bad element <%s>, expected <marc>|<xmlreader>",
+                    ptr->name);
              return ZEBRA_FAIL;
          }
      }
@@ -467,7 +489,7 @@ static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
      
      yaz_log(YLOG_LOG, "%s dom filter: "
              "loading config file %s", tinfo->fname, tinfo->full_name);
-    
+
      doc = xmlParseFile(tinfo->full_name);
      if (!doc)
      {
@@ -483,19 +505,14 @@ static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
      if (!ptr || ptr->type != XML_ELEMENT_NODE 
          || XML_STRCMP(ptr->name, "dom"))
      {
-        xmlChar *node_path = xmlGetNodePath(ptr);
-        yaz_log(YLOG_WARN, "%s: dom filter: "
-                "%s bad root element <%s>,"
-                " expected root element <dom>", 
-                tinfo->fname, node_path, ptr->name);  
-        xmlFree(node_path);
+        dom_log(YLOG_WARN, tinfo, ptr,
+                "bad root element <%s>, expected root element <dom>", 
+                ptr->name);  
          return ZEBRA_FAIL;
      }
  
-    for (ptr = ptr->children; ptr; ptr = ptr->next)
-    {
-        if (ptr->type != XML_ELEMENT_NODE)
-            continue;
+    ptr = ptr->children;
+    FOR_EACH_ELEMENT(ptr) {
          if (!XML_STRCMP(ptr->name, "extract"))
          {
              /*
@@ -517,13 +534,9 @@ static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
                      ;
                  else
                  {
-                    xmlChar *node_path = xmlGetNodePath(ptr);
-                    yaz_log(YLOG_WARN, "%s: dom filter: "
-                            "%s bad attribute @%s"
-                            " expected @name",
-                            tinfo->fname, 
-                            node_path, attr->name);
-                    xmlFree(node_path);
+                    dom_log(YLOG_WARN, tinfo, ptr,
+                            "bad attribute @%s, expected @name",
+                            attr->name);
                  }
              }
              parse_convert(tinfo, ptr->children, &f->convert);
@@ -559,13 +572,9 @@ static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
                      ;
                  else
                  {
-                    xmlChar *node_path = xmlGetNodePath(ptr);
-                    yaz_log(YLOG_WARN, "%s: dom filter: "
-                            "%s bad attribute @%s"
-                            " expected @identifier|@name",
-                            tinfo->fname, 
-                            node_path, attr->name);
-                    xmlFree(node_path);
+                    dom_log(YLOG_WARN, tinfo, ptr,
+                            "bad attribute @%s,  expected @identifier|@name",
+                            attr->name);
                  }
              }
              parse_convert(tinfo, ptr->children, &f->convert);
@@ -606,28 +615,29 @@ static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
                      ;
                  else
                  {
-                    xmlChar *node_path = xmlGetNodePath(ptr);
-                    yaz_log(YLOG_WARN, "%s: dom filter: "
-                            "%s bad attribute @%s"
-                            " expected @syntax|@name",
-                            tinfo->fname, 
-                            node_path, attr->name);
-                    xmlFree(node_path);
+                    dom_log(YLOG_WARN, tinfo, ptr,
+                            "bad attribute @%s,  expected @syntax|@name",
+                            attr->name);
                  }
              }
              parse_input(tinfo, ptr->children, syntax, name);
          }
          else
          {
-            xmlChar *node_path = xmlGetNodePath(ptr);
-            yaz_log(YLOG_WARN, "%s: dom filter: "
-                    "%s bad element <%s>,"
-                    " expected <extract>|<input>|<retrieve>|<store>",
-                    tinfo->fname, node_path, ptr->name);
-            xmlFree(node_path);
+            dom_log(YLOG_WARN, tinfo, ptr,
+                    "bad element <%s>, "
+                    "expected <extract>|<input>|<retrieve>|<store>",
+                    ptr->name);
              return ZEBRA_FAIL;
          }
      }
+    if (!tinfo->input_list)
+    {
+        struct filter_input *p 
+            = new_input(tinfo, DOM_INPUT_XMLREADER);
+        p->u.xmlreader.split_level = 0;
+        p->u.xmlreader.reader = 0;
+    }
      return ZEBRA_OK;
  }
  
@@ -694,12 +704,12 @@ static int ioclose_ex(void *context)
  
  /* DOM filter style indexing */
  static int attr_content_xml(struct _xmlAttr *attr, const char *name,
-                            xmlChar **dst_content)
+                            const char **dst_content)
  {
      if (0 == XML_STRCMP(attr->name, name) && attr->children 
          && attr->children->type == XML_TEXT_NODE)
      {
-        *dst_content = (attr->children->content);
+        *dst_content = (const char *) (attr->children->content);
          return 1;
      }
      return 0;
@@ -711,106 +721,139 @@ static void index_value_of(struct filter_info *tinfo,
                             struct recExtractCtrl *extctr,
                             RecWord* recword, 
                             xmlNodePtr node, 
-                           xmlChar * index_p)
+                           const char *index_p)
  {
-    xmlChar *text = xmlNodeGetContent(node);
-    size_t text_len = strlen((const char *)text);
-
-
-    /* if there is no text, we do not need to proceed */
-    if (text_len)
-    {            
-        xmlChar *look = index_p;
-        xmlChar *bval;
-        xmlChar *eval;
-
-        xmlChar index[256];
-        xmlChar type[256];
+    if (tinfo->record_info_invoked == 1)
+    {
+        xmlChar *text = xmlNodeGetContent(node);
+        size_t text_len = strlen((const char *)text);
+       
+        /* if there is no text, we do not need to proceed */
+        if (text_len)
+        {            
+            const char *look = index_p;
+            const char *bval;
+            const char *eval;
+
+            xmlChar index[256];
+            xmlChar type[256];
  
-        /* assingning text to be indexed */
-        recword->term_buf = (const char *)text;
-        recword->term_len = text_len;
+            /* assigning text to be indexed */
+            recword->term_buf = (const char *)text;
+            recword->term_len = text_len;
  
-        /* parsing all index name/type pairs */
-        /* may not start with ' ' or ':' */
-        while (*look && ' ' != *look && ':' != *look)
-        {
-            /* setting name and type to zero */
-            *index = '\0';
-            *type = '\0';
-    
-            /* parsing one index name */
-            bval = look;
-            while (*look && ':' != *look && ' ' != *look)
+            /* parsing all index name/type pairs */
+            /* may not start with ' ' or ':' */
+            while (*look && ' ' != *look && ':' != *look)
              {
-                look++;
-            }
-            eval = look;
-            strncpy((char *)index, (const char *)bval, eval - bval);
-            index[eval - bval] = '\0';
+                /* setting name and type to zero */
+                *index = '\0';
+                *type = '\0';
      
-    
-            /* parsing one index type, if existing */
-            if (':' == *look)
-            {
-                look++;
-      
+                /* parsing one index name */
                  bval = look;
-                while (*look && ' ' != *look)
+                while (*look && ':' != *look && ' ' != *look)
                  {
                      look++;
                  }
                  eval = look;
-                strncpy((char *)type, (const char *)bval, eval - bval);
-                type[eval - bval] = '\0';
-            }
+                strncpy((char *)index, (const char *)bval, eval - bval);
+                index[eval - bval] = '\0';
+    
+    
+                /* parsing one index type, if existing */
+                if (':' == *look)
+                {
+                    look++;
+      
+                    bval = look;
+                    while (*look && ' ' != *look)
+                    {
+                        look++;
+                    }
+                    eval = look;
+                    strncpy((char *)type, (const char *)bval, eval - bval);
+                    type[eval - bval] = '\0';
+                }
  
-            /* actually indexing the text given */
-            yaz_log(YLOG_DEBUG, "%s dom filter: "
-                    "INDEX  '%s:%s' '%s'", 
-                    tinfo->fname, index, type, text);
+                /* actually indexing the text given */
  
-            recword->index_name = (const char *)index;
-            if (type && *type)
-                recword->index_type = *type;
-            (extctr->tokenAdd)(recword);
+                recword->index_name = (const char *)index;
+                if (type && *type)
+                    recword->index_type = *type;
  
-            /* eat whitespaces */
-            if (*look && ' ' == *look && *(look+1))
-            {
-                look++;
-            } 
+                /* writing debug out */
+                if (extctr->flagShowRecords)
+                    dom_log(YLOG_LOG, tinfo, 0, 
+                            "INDEX '%s:%s' '%s'", 
+                            index ? (const char *) index : "null",
+                            type ? (const char *) type : "null", 
+                            text ? (const char *) text : "null");
+                
+                /* actually indexing the text given */
+                recword->index_name = (const char *)index;
+                if (type && *type)
+                    recword->index_type = *type;
+                (extctr->tokenAdd)(recword);
+
+                /* eat whitespaces */
+                if (*look && ' ' == *look)
+                {
+                    look++;
+                } 
+            }
          }
+        xmlFree(text); 
      }
-    
-    xmlFree(text); 
  }
  
  
  /* DOM filter style indexing */
  static void set_record_info(struct filter_info *tinfo, 
                              struct recExtractCtrl *extctr, 
-                            xmlChar * id_p, 
-                            xmlChar * rank_p, 
-                            xmlChar * type_p)
+                            xmlNodePtr node, 
+                            const char * id_p, 
+                            const char * rank_p, 
+                            const char * type_p)
  {
-    yaz_log(YLOG_DEBUG, "%s dom filter: "
-            "RECORD id=%s rank=%s type=%s", 
-            tinfo->fname,  id_p, rank_p, type_p);
+    /* writing debug info out */
+    if (extctr && extctr->flagShowRecords)
+        dom_log(YLOG_LOG, tinfo, node,
+                "RECORD id=%s rank=%s type=%s", 
+                id_p ? (const char *) id_p : "(null)",
+                rank_p ? (const char *) rank_p : "(null)",
+                type_p ? (const char *) type_p : "(null)");
      
-    if (id_p)
+
+    if (id_p && *id_p)
          sscanf((const char *)id_p, "%255s", extctr->match_criteria);
  
-    if (rank_p)
+    if (rank_p && *rank_p)
          extctr->staticrank = atozint((const char *)rank_p);
  
-    /*     if (!strcmp("update", type_str)) */
-    /*         index_node(tinfo, ctrl, ptr, recword); */
-    /*     else if (!strcmp("delete", type_str)) */
-    /*         yaz_log(YLOG_WARN, "dom filter delete: to be implemented"); */
-    /*     else */
-    /*         yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'",  */
-    /*                 type_str); */
+    if (type_p && *type_p)
+    {
+        enum zebra_recctrl_action_t action = action_update;
+        if (!strcmp(type_p, "insert"))
+            action = action_insert;
+        else if (!strcmp(type_p, "delete"))
+            action = action_delete;
+        else if (!strcmp(type_p, "replace"))
+            action = action_replace;
+        else if (!strcmp(type_p, "update"))
+            action = action_update;
+        else
+            dom_log(YLOG_WARN, tinfo, node, "bad @type value: %s", type_p);
+        extctr->action = action;
+        yaz_log(YLOG_LOG, "In mod_dom.c: setting action to %d", action);
+    }
+
+    if (tinfo->record_info_invoked == 1)
+    {
+        /* warn about multiple only once */
+        dom_log(YLOG_WARN, tinfo, node, "multiple record elements");
+    }
+    tinfo->record_info_invoked++;
  
  }
  
@@ -826,30 +869,28 @@ static void process_xml_element_zebra_node(struct filter_info *tinfo,
      {
           if (0 == XML_STRCMP(node->name, "index"))
           {
-            xmlChar *index_p = 0;
+            const char *index_p = 0;
  
              struct _xmlAttr *attr;      
              for (attr = node->properties; attr; attr = attr->next)
              {
                  if (attr_content_xml(attr, "name", &index_p))
                  {
-                    index_value_of(tinfo, extctr, recword,node, index_p);
+                    index_value_of(tinfo, extctr, recword, node, index_p);
                  }  
                  else
                  {
-                    xmlChar *node_path = xmlGetNodePath(node);
-                    yaz_log(YLOG_WARN,"%s dom filter: "
-                            "%s bad attribute @%s, expected @name",
-                            tinfo->fname, node_path, attr->name);
-                    xmlFree(node_path);
+                    dom_log(YLOG_WARN, tinfo, node,
+                            "bad attribute @%s, expected @name",
+                            attr->name);
                  }
              }
          }
          else if (0 == XML_STRCMP(node->name, "record"))
          {
-            xmlChar *id_p = 0;
-            xmlChar *rank_p = 0;
-            xmlChar *type_p = 0;
+            const char *id_p = 0;
+            const char *rank_p = 0;
+            const char *type_p = 0;
  
              struct _xmlAttr *attr;
              for (attr = node->properties; attr; attr = attr->next)
@@ -862,117 +903,91 @@ static void process_xml_element_zebra_node(struct filter_info *tinfo,
                      ;
                  else
                  {
-                    xmlChar *node_path = xmlGetNodePath(node);
-                    yaz_log(YLOG_WARN,"%s dom filter: "
-                            "%s bad attribute @%s,"
-                            " expected @id|@rank|@type",
-                            tinfo->fname, node_path, attr->name);
-                    xmlFree(node_path);
-                }
-
-                if (type_p && 0 != strcmp("update", (const char *)type_p))
-                {
-                    xmlChar *node_path = xmlGetNodePath(node);
-                    yaz_log(YLOG_WARN,"%s dom filter: "
-                            "%s attribute @%s,"
-                            " only implemented '@type='update'",
-                            tinfo->fname, node_path, attr->name);
-                    xmlFree(node_path);
+                    dom_log(YLOG_WARN, tinfo, node,
+                            "bad attribute @%s, expected @id|@rank|@type",
+                            attr->name);
                  }
-          
-
              }
-            set_record_info(tinfo, extctr, id_p, rank_p, type_p);
+            set_record_info(tinfo, extctr, node, id_p, rank_p, type_p);
          } 
          else
          {
-            xmlChar *node_path = xmlGetNodePath(node);
-            yaz_log(YLOG_WARN,"%s dom filter: "
-                    "%s bad element <%s>,"
+            dom_log(YLOG_WARN, tinfo, node,
+                    "bad element <%s>,"
                      " expected <record>|<index> in namespace '%s'",
-                    tinfo->fname, node_path, 
                      node->name, zebra_dom_ns);
-            xmlFree(node_path);
          }
      }
  }
  
+static int attr_content_pi(const char **c_ptr, const char *name,
+                           char *value, size_t value_max)
+{
+    size_t name_len = strlen(name);
+    const char *look = *c_ptr;
+    int ret = 0;
+
+    *value = '\0';
+    while (*look && ' ' == *look)
+        look++;
+    if (strlen(look) > name_len)
+    {
+        if (look[name_len] == '=' && !memcmp(look, name, name_len))
+        {
+            size_t i = 0;
+            look += name_len+1;
+            while (*look && ' ' != *look)
+            {
+                if (i < value_max-1)
+                    value[i++] = *look;
+                look++;
+            }
+            value[i] = '\0';
+            ret = 1;
+        }
+    }
+    while (*look && ' ' == *look)
+        look++;
+    *c_ptr = look;
+    return ret;
+}
  
  /* DOM filter style indexing */
  static void process_xml_pi_node(struct filter_info *tinfo, 
                                  struct recExtractCtrl *extctr, 
                                  xmlNodePtr node,
-                                xmlChar **index_pp)
+                                const char **index_pp)
  {
      /* if right PI name, continue parsing PI */
      if (0 == strcmp(zebra_pi_name, (const char *)node->name))
      {
          xmlChar *pi_p =  node->content;
-        xmlChar *look = pi_p;
+        const char *look = (const char *) node->content;
      
-        xmlChar *bval;
-        xmlChar *eval;
-
          /* parsing PI record instructions */
          if (0 == strncmp((const char *)look, "record", 6))
          {
-            xmlChar id[256];
-            xmlChar rank[256];
-            xmlChar type[256];
-
+            char id[256];
+            char rank[256];
+            char type[256];
+            
              *id = '\0';
              *rank = '\0';
              *type = '\0';
-      
              look += 6;
-      
-            /* eat whitespace */
-            while (*look && ' ' == *look && *(look+1))
-                look++;
-
-            /* parse possible id */
-            if (*look && 0 == strncmp((const char *)look, "id=", 3))
-            {
-                look += 3;
-                bval = look;
-                while (*look && ' ' != *look)
-                    look++;
-                eval = look;
-                strncpy((char *)id, (const char *)bval, eval - bval);
-                id[eval - bval] = '\0';
-            }
-      
-            /* eat whitespace */
-            while (*look && ' ' == *look && *(look+1))
-                look++;
-      
-            /* parse possible rank */
-            if (*look && 0 == strncmp((const char *)look, "rank=", 5))
-            {
-                look += 6;
-                bval = look;
-                while (*look && ' ' != *look)
-                    look++;
-                eval = look;
-                strncpy((char *)rank, (const char *)bval, eval - bval);
-                rank[eval - bval] = '\0';
-            }
-
-            /* eat whitespace */
-            while (*look && ' ' == *look && *(look+1))
-                look++;
-
-            if (look && '\0' != *look)
-            {
-                xmlChar *node_path = xmlGetNodePath(node);
-                yaz_log(YLOG_WARN,"%s dom filter: "
-                        "%s content '%s', can not parse '%s'",
-                        tinfo->fname, node_path, pi_p, look);
-                xmlFree(node_path);
-            }
-            else 
-                set_record_info(tinfo, extctr, id, rank, 0);
-
+            while (*look)
+                if (attr_content_pi(&look, "id", id, sizeof(id)))
+                    ;
+                else if (attr_content_pi(&look, "rank", rank, sizeof(rank)))
+                    ;
+                else if (attr_content_pi(&look, "type", type, sizeof(type)))
+                {
+                    dom_log(YLOG_WARN, tinfo, node,
+                            "content '%s', can not parse '%s'",
+                            pi_p, look);
+                    break;
+                }
+            set_record_info(tinfo, extctr, node, id, rank, type);
          } 
          /* parsing index instruction */
          else if (0 == strncmp((const char *)look, "index", 5))
@@ -980,7 +995,7 @@ static void process_xml_pi_node(struct filter_info *tinfo,
              look += 5;
        
              /* eat whitespace */
-            while (*look && ' ' == *look && *(look+1))
+            while (*look && ' ' == *look)
                  look++;
  
              /* export index instructions to outside */
@@ -988,11 +1003,9 @@ static void process_xml_pi_node(struct filter_info *tinfo,
          } 
          else 
          {
-            xmlChar *node_path = xmlGetNodePath(node);
-            yaz_log(YLOG_WARN,"%s dom filter: "
-                    "%s content '%s', can not parse '%s'",
-                    tinfo->fname, node_path, pi_p, look);
-            xmlFree(node_path);
+            dom_log(YLOG_WARN, tinfo, node,
+                    "content '%s', can not parse '%s'",
+                    pi_p, look);
          }
      }
  }
@@ -1004,7 +1017,7 @@ static void process_xml_element_node(struct filter_info *tinfo,
                                       xmlNodePtr node)
  {
      /* remember indexing instruction from PI to next element node */
-    xmlChar *index_p = 0;
+    const char *index_p = 0;
  
      /* check if we are an element node in the special zebra namespace 
         and either set record data or index value-of node content*/
@@ -1039,20 +1052,10 @@ static void extract_dom_doc_node(struct filter_info *tinfo,
                                   struct recExtractCtrl *extctr, 
                                   xmlDocPtr doc)
  {
-    xmlChar *buf_out;
-    int len_out;
-
      /* only need to do the initialization once, reuse recword for all terms */
      RecWord recword;
      (*extctr->init)(extctr, &recword);
  
-    if (extctr->flagShowRecords)
-    {
-        xmlDocDumpMemory(doc, &buf_out, &len_out);
-        fwrite(buf_out, len_out, 1, stdout);
-        xmlFree(buf_out);
-    }
-
      process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
  }
  
@@ -1071,43 +1074,71 @@ static int convert_extract_doc(struct filter_info *tinfo,
      xsltStylesheetPtr last_xsp = 0;
      xmlDocPtr store_doc = 0;
  
+    /* per default do not ingest record */
+    tinfo->record_info_invoked = 0;
+
+    /* exit if empty document given */
+    if (!doc)
+        return RECCTRL_EXTRACT_SKIP;
+
+    /* we actually have a document which needs to be processed further */
      params[0] = 0;
      set_param_str(params, "schema", zebra_dom_ns, tinfo->odr_record);
  
+    if (p && p->flagShowRecords)
+    {
+        xmlChar *buf_out = 0;
+        int len_out = 0;
+        /* serialize outside the disabled block: the log call needs it */
+        xmlDocDumpMemory(doc, &buf_out, &len_out);
+#if 0
+        FILE *outf = fopen("extract.xml", "w");
+        fwrite(buf_out, 1, len_out, outf);
+        fclose(outf);
+#endif
+        yaz_log(YLOG_LOG, "Extract Doc: %.*s", len_out, buf_out);
+        xmlFree(buf_out);
+    }
+
      /* input conversion */
-    perform_convert(tinfo, input->convert, params, &doc, 0);
+    perform_convert(tinfo, p, input->convert, params, &doc, 0);
+
  
      if (tinfo->store)
      {
          /* store conversion */
          store_doc = xmlCopyDoc(doc, 1);
-        perform_convert(tinfo, tinfo->store->convert,
+        perform_convert(tinfo, p, tinfo->store->convert,
                          params, &store_doc, &last_xsp);
      }
      
+    /* saving either store doc or original doc in case no store doc exists */
      if (last_xsp)
          xsltSaveResultToString(&buf_out, &len_out, 
                                 store_doc ? store_doc : doc, last_xsp);
      else
          xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
-    if (p->flagShowRecords)
-       fwrite(buf_out, len_out, 1, stdout);
-    (*p->setStoreData)(p, buf_out, len_out);
+
+    if (p->setStoreData)
+        (*p->setStoreData)(p, buf_out, len_out);
      xmlFree(buf_out);
  
      if (store_doc)
          xmlFreeDoc(store_doc);
  
      /* extract conversion */
-    perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0);
+    perform_convert(tinfo, p, tinfo->extract->convert, params, &doc, 0);
+
  
      /* finally, do the indexing */
-    if (doc)
-    {
+    if (doc){
          extract_dom_doc_node(tinfo, p, doc);
-        /* extract_doc_alvis(tinfo, p, doc); */
         xmlFreeDoc(doc);
      }
+    
+    /* there was nothing to index, so there is no inserted/updated record */
+    if (tinfo->record_info_invoked == 0)
+        return RECCTRL_EXTRACT_SKIP;
  
      return RECCTRL_EXTRACT_OK;
  }
@@ -1126,8 +1157,9 @@ static int extract_xml_split(struct filter_info *tinfo,
                                                     p /* I/O handler */,
                                                     0 /* URL */, 
                                                     0 /* encoding */,
-                                                   XML_PARSE_XINCLUDE|
-                                                   XML_PARSE_NOENT);
+                                                   XML_PARSE_XINCLUDE
+                                                   | XML_PARSE_NOENT
+                                                   | XML_PARSE_NONET);
      }
      if (!input->u.xmlreader.reader)
         return RECCTRL_EXTRACT_ERROR_GENERIC;
@@ -1137,18 +1169,37 @@ static int extract_xml_split(struct filter_info *tinfo,
      {
          int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
          int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
+
          if (type == XML_READER_TYPE_ELEMENT && 
              input->u.xmlreader.split_level == depth)
          {
-            xmlNodePtr ptr
-                = xmlTextReaderExpand(input->u.xmlreader.reader);
+            xmlNodePtr ptr;
+
+            /* per default do not ingest record */
+            tinfo->record_info_invoked = 0;
+            
+            ptr = xmlTextReaderExpand(input->u.xmlreader.reader);
              if (ptr)
-            {
+                {
+                /* we have a new document */
+
                  xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
                  xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
                  
                  xmlDocSetRootElement(doc, ptr2);
                  
+                /* writing debug info out */
+                if (p->flagShowRecords)
+                {
+                    xmlChar *buf_out = 0;
+                    int len_out = 0;
+                    xmlDocDumpMemory(doc, &buf_out, &len_out);
+                    yaz_log(YLOG_LOG, "%s: XMLREADER level: %i\n%.*s", 
+                            tinfo->fname ? tinfo->fname : "(none)",
+                            depth, len_out, buf_out); 
+                    xmlFree(buf_out);
+                }
+                
                  return convert_extract_doc(tinfo, input, p, doc);
              }
              else
@@ -1175,7 +1226,9 @@ static int extract_xml_full(struct filter_info *tinfo,
                                    p /* I/O handler */,
                                    0 /* URL */,
                                    0 /* encoding */,
-                                  XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
+                                  XML_PARSE_XINCLUDE
+                                  | XML_PARSE_NOENT
+                                  | XML_PARSE_NONET);
          if (!doc)
          {
              return RECCTRL_EXTRACT_ERROR_GENERIC;
@@ -1200,9 +1253,9 @@ static int extract_iso2709(struct filter_info *tinfo,
      {
          int i;
  
-        yaz_log(YLOG_WARN, "%s dom filter: "
+        dom_log(YLOG_WARN, tinfo, 0,
                  "MARC: Skipping bad byte %d (0x%02X)",
-                tinfo->fname, *buf & 0xff, *buf & 0xff);
+                *buf & 0xff, *buf & 0xff);
          for (i = 0; i<4; i++)
              buf[i] = buf[i+1];
  
@@ -1212,32 +1265,31 @@ static int extract_iso2709(struct filter_info *tinfo,
      record_length = atoi_n (buf, 5);
      if (record_length < 25)
      {
-        yaz_log (YLOG_WARN, "%s dom filter: "
-                 "MARC record length < 25, is %d", 
-                 tinfo->fname, record_length);
+        dom_log(YLOG_WARN, tinfo, 0,
+                "MARC record length < 25, is %d",  record_length);
          return RECCTRL_EXTRACT_ERROR_GENERIC;
      }
      read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
      if (read_bytes < record_length-5)
      {
-        yaz_log (YLOG_WARN, "%s dom filter: "
-                 "Couldn't read whole MARC record",
-                 tinfo->fname);
+        dom_log(YLOG_WARN, tinfo, 0,
+                "couldn't read whole MARC record");
          return RECCTRL_EXTRACT_ERROR_GENERIC;
      }
      r = yaz_marc_read_iso2709(input->u.marc.handle,  buf, record_length);
      if (r < record_length)
      {
-        yaz_log (YLOG_WARN, "%s dom filter: "
-                 "Parsing of MARC record failed r=%d length=%d",
-                 tinfo->fname, r, record_length);
+        dom_log (YLOG_WARN, tinfo, 0,
+                 "parsing of MARC record failed r=%d length=%d",
+                 r, record_length);
          return RECCTRL_EXTRACT_ERROR_GENERIC;
      }
      else
      {
          xmlDocPtr rdoc;
          xmlNode *root_ptr;
-        yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0);
+        yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 
+                           "http://www.loc.gov/MARC21/slim", 0, 0);
          rdoc = xmlNewDoc((const xmlChar*) "1.0");
          xmlDocSetRootElement(rdoc, root_ptr);
          return convert_extract_doc(tinfo, input, p, rdoc);        
@@ -1252,8 +1304,11 @@ static int filter_extract(void *clientData, struct recExtractCtrl *p)
  
      if (!input)
          return RECCTRL_EXTRACT_ERROR_GENERIC;
-
+    
      odr_reset(tinfo->odr_record);
+
+    if (p->setStoreData == 0)
+        return extract_xml_full(tinfo, input, p);
      switch(input->type)
      {
      case DOM_INPUT_XMLREADER:
@@ -1309,6 +1364,7 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
      {
          p->diagnostic =
              YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
+        p->addinfo = odr_strdup(p->odr, esn);
          return 0;
      }
  
@@ -1336,7 +1392,7 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
      doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
                     0 /* URL */,
                     0 /* encoding */,
-                   XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
+                   XML_PARSE_XINCLUDE | XML_PARSE_NOENT | XML_PARSE_NONET);
      if (!doc)
      {
          p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
@@ -1344,12 +1400,13 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
      }
  
      /* retrieve conversion */
-    perform_convert(tinfo, retrieve->convert, params, &doc, &last_xsp);
+    perform_convert(tinfo, 0, retrieve->convert, params, &doc, &last_xsp);
      if (!doc)
      {
          p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
      }
-    else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML)
+    else if (!p->input_format
+             || !oid_oidcmp(p->input_format, yaz_oid_recsyn_xml))
      {
          xmlChar *buf_out;
          int len_out;
@@ -1359,13 +1416,13 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
          else
              xmlDocDumpMemory(doc, &buf_out, &len_out);            
  
-        p->output_format = VAL_TEXT_XML;
+        p->output_format = yaz_oid_recsyn_xml;
          p->rec_len = len_out;
          p->rec_buf = odr_malloc(p->odr, p->rec_len);
          memcpy(p->rec_buf, buf_out, p->rec_len);
          xmlFree(buf_out);
      }
-    else if (p->output_format == VAL_SUTRS)
+    else if (!oid_oidcmp(p->output_format, yaz_oid_recsyn_sutrs))
      {
          xmlChar *buf_out;
          int len_out;
@@ -1375,7 +1432,7 @@ static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
          else
              xmlDocDumpMemory(doc, &buf_out, &len_out);            
          
-        p->output_format = VAL_SUTRS;
+        p->output_format = yaz_oid_recsyn_sutrs;
          p->rec_len = len_out;
          p->rec_buf = odr_malloc(p->odr, p->rec_len);
          memcpy(p->rec_buf, buf_out, p->rec_len);