1 /* $Id: mod_dom.c,v 1.13 2007-02-15 14:33:41 marc Exp $
2 Copyright (C) 1995-2007
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
27 #include <yaz/diagbib1.h>
28 #include <yaz/tpath.h>
30 #include <libxml/xmlversion.h>
31 #include <libxml/parser.h>
32 #include <libxml/tree.h>
33 #include <libxml/xmlIO.h>
34 #include <libxml/xmlreader.h>
35 #include <libxslt/transform.h>
36 #include <libxslt/xsltutils.h>
39 #include <libexslt/exslt.h>
42 #include <idzebra/util.h>
43 #include <idzebra/recctrl.h>
47 /* Alvis style indexing */
48 #define ZEBRA_SCHEMA_XSLT_NS "http://indexdata.dk/zebra/xslt/1"
49 static const char *zebra_xslt_ns = ZEBRA_SCHEMA_XSLT_NS;
51 /* DOM filter style indexing */
52 #define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
53 static const char *zebra_dom_ns = ZEBRA_DOM_NS;
55 /* DOM filter style indexing */
56 #define ZEBRA_PI_NAME "zebra-2.0"
57 static const char *zebra_pi_name = ZEBRA_PI_NAME;
62 const char *stylesheet;
63 xsltStylesheetPtr stylesheet_xsp;
64 struct convert_s *next;
67 struct filter_extract {
69 struct convert_s *convert;
73 struct convert_s *convert;
76 struct filter_retrieve {
78 const char *identifier;
79 struct convert_s *convert;
80 struct filter_retrieve *next;
83 #define DOM_INPUT_XMLREADER 1
84 #define DOM_INPUT_MARC 2
88 struct convert_s *convert;
92 const char *input_charset;
97 xmlTextReaderPtr reader;
101 struct filter_input *next;
107 const char *profile_path;
110 xmlDocPtr doc_config;
111 struct filter_extract *extract;
112 struct filter_retrieve *retrieve_list;
113 struct filter_input *input_list;
114 struct filter_store *store;
117 #define XML_STRCMP(a,b) strcmp((char*)a, b)
118 #define XML_STRLEN(a) strlen((char*)a)
/* Wraps value in single quotes in a buffer taken from odr.  Several short
   lines of this function are absent from this excerpt, so how the quoted
   string ends up in params is not shown here. */
123 static void set_param_str(const char **params, const char *name,
124 const char *value, ODR odr)
/* 3 extra bytes: opening quote, closing quote and the terminating NUL */
126 char *quoted = odr_malloc(odr, 3 + strlen(value));
/* NOTE(review): value is copied verbatim, so an embedded ' is not escaped
   by this line -- confirm whether that is handled elsewhere */
127 sprintf(quoted, "'%s'", value);
/* Integer variant of set_param_str: formats value with ZINT_FORMAT between
   single quotes into a fixed 30-byte buffer allocated from odr.  The rest of
   the signature and body is not part of this excerpt. */
135 static void set_param_int(const char **params, const char *name,
138 char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */
/* two quote characters + digits + NUL must fit in the 30 bytes above */
141 sprintf(quoted, "'" ZINT_FORMAT "'", value);
/* Allocates the per-filter state with xmalloc, creates its two ODR memory
   handles and clears the pointer members listed below.  The return statement
   is not part of this excerpt. */
147 static void *filter_init(Res res, RecType recType)
149 struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
151 tinfo->full_name = 0;
152 tinfo->profile_path = 0;
/* odr_record is reset at the start of every filter_extract call;
   odr_config holds configuration data and is reset by destroy_dom */
153 tinfo->odr_record = odr_createmem(ODR_ENCODE);
154 tinfo->odr_config = odr_createmem(ODR_ENCODE);
156 tinfo->retrieve_list = 0;
157 tinfo->input_list = 0;
159 tinfo->doc_config = 0;
/* Tests whether attr is named `name` and has a text node as first child;
   if so, *dst_content is pointed at that node's content.  No copy is made:
   the pointer refers to memory owned by the attribute node.  The return
   statements are not visible here; callers use the result as a boolean. */
168 static int attr_content(struct _xmlAttr *attr, const char *name,
169 const char **dst_content)
171 if (!XML_STRCMP(attr->name, name) && attr->children
172 && attr->children->type == XML_TEXT_NODE)
174 *dst_content = (const char *)(attr->children->content);
/* Releases the compiled XSLT stylesheet held by a convert_s entry when one
   is set.  NOTE(review): callers pass the head of a convert chain; the
   iteration over ->next is presumably in lines missing from this excerpt. */
180 static void destroy_xsp(struct convert_s *c)
184 if (c->stylesheet_xsp)
185 xsltFreeStylesheet(c->stylesheet_xsp);
/* Tears down everything built from the configuration file: the stylesheets
   of the extract, store, input and retrieve sections, the per-input reader
   or MARC handles, the parsed configuration document, and finally the
   odr_config memory.  Guards around the first two calls are not visible in
   this excerpt. */
190 static void destroy_dom(struct filter_info *tinfo)
194 destroy_xsp(tinfo->extract->convert);
199 destroy_xsp(tinfo->store->convert);
202 if (tinfo->input_list)
204 struct filter_input *i_ptr;
205 for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
/* release the type-specific resources of each input */
209 case DOM_INPUT_XMLREADER:
210 if (i_ptr->u.xmlreader.reader)
211 xmlFreeTextReader(i_ptr->u.xmlreader.reader);
214 yaz_iconv_close(i_ptr->u.marc.iconv);
215 yaz_marc_destroy(i_ptr->u.marc.handle);
218 destroy_xsp(i_ptr->convert);
/* list nodes are not freed one by one; new_input allocates them from
   odr_config, which is reset at the end of this function */
220 tinfo->input_list = 0;
222 if (tinfo->retrieve_list)
224 struct filter_retrieve *r_ptr;
225 for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
226 destroy_xsp(r_ptr->convert);
227 tinfo->retrieve_list = 0;
230 if (tinfo->doc_config)
232 xmlFreeDoc(tinfo->doc_config);
233 tinfo->doc_config = 0;
235 odr_reset(tinfo->odr_config);
/* Walks the sibling list starting at ptr and, for every <xslt> element,
   allocates a convert_s from odr_config, reads its @stylesheet attribute,
   resolves the file name and compiles the stylesheet.  Problems are
   reported with YLOG_WARN.  How entries are linked into *l and what is
   returned is not visible in this excerpt.
   NOTE(review): every xmlGetNodePath() result below is passed straight to
   yaz_log and never freed; libxml2 documents that the caller must free the
   returned string, so each warning appears to leak it. */
238 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
239 struct convert_s **l)
242 for(; ptr; ptr = ptr->next)
244 if (ptr->type != XML_ELEMENT_NODE)
246 if (!XML_STRCMP(ptr->name, "xslt"))
248 struct _xmlAttr *attr;
250 = odr_malloc(tinfo->odr_config, sizeof(*p));
254 p->stylesheet_xsp = 0;
/* only @stylesheet is accepted on <xslt> */
256 for (attr = ptr->properties; attr; attr = attr->next)
257 if (attr_content(attr, "stylesheet", &p->stylesheet))
260 yaz_log(YLOG_WARN, "%s: dom filter: "
261 "%s bad attribute @%s, "
262 "expected @stylesheet",
264 xmlGetNodePath(ptr), attr->name);
/* fixed-size buffer receiving the resolved stylesheet path */
267 char tmp_xslt_full_name[1024];
268 if (!yaz_filepath_resolve(p->stylesheet,
273 yaz_log(YLOG_WARN, "%s: dom filter: "
274 "stylesheet %s not found in "
278 tinfo->profile_path);
283 = xsltParseStylesheetFile((const xmlChar*)
285 if (!p->stylesheet_xsp)
287 yaz_log(YLOG_WARN, "%s: dom filter: "
288 "could not parse xslt "
290 tinfo->fname, tmp_xslt_full_name);
296 yaz_log(YLOG_WARN, "%s: dom filter: "
297 "%s missing attribute 'stylesheet' ",
298 tinfo->fname, xmlGetNodePath(ptr));
309 tinfo->fname, xmlGetNodePath(ptr), ptr->name);
/* Applies each stylesheet of the convert chain in turn with
   xsltApplyStylesheet.  When last_xsp is supplied by the caller it records
   the stylesheet most recently applied (callers also pass 0 for it; the
   guard is not visible in this excerpt).  How the result document replaces
   the input document is in lines missing here. */
317 static ZEBRA_RES perform_convert(struct filter_info *tinfo,
318 struct convert_s *convert,
321 xsltStylesheetPtr *last_xsp)
323 for (; convert; convert = convert->next)
325 xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
328 *last_xsp = convert->stylesheet_xsp;
/* Appends a new filter_input to the end of tinfo->input_list.  The node is
   allocated from odr_config; its initialisation and the return statement
   are not part of this excerpt. */
335 static struct filter_input *new_input(struct filter_info *tinfo, int type)
337 struct filter_input *p;
/* np ends up addressing the terminating next pointer of the list */
338 struct filter_input **np = &tinfo->input_list;
339 for (;*np; np = &(*np)->next)
341 p = *np = odr_malloc(tinfo->odr_config, sizeof(*p));
/* Parses the children of an <input> element.  Two child elements are
   recognised: <marc> (optional @charset, default "marc-8") and <xmlreader>
   (optional @level).  Each creates a filter_input via new_input and has its
   conversion chain parsed with parse_convert.  Anything else is reported
   with YLOG_WARN.  The remaining parameters and the return statements are
   not visible in this excerpt. */
350 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
354 for (; ptr; ptr = ptr->next)
356 if (ptr->type != XML_ELEMENT_NODE)
358 if (!XML_STRCMP(ptr->name, "marc"))
360 yaz_iconv_t iconv = 0;
361 const char *input_charset = "marc-8";
362 struct _xmlAttr *attr;
364 for (attr = ptr->properties; attr; attr = attr->next)
366 if (attr_content(attr, "charset", &input_charset))
369 yaz_log(YLOG_WARN, "%s: dom filter: "
370 "%s bad attribute @%s,"
371 " expected @charset",
373 xmlGetNodePath(ptr), attr->name);
/* records are converted from the configured charset to UTF-8 */
375 iconv = yaz_iconv_open("utf-8", input_charset);
378 yaz_log(YLOG_WARN, "%s: dom filter: "
379 "%s unsupported @charset '%s'",
380 tinfo->fname, xmlGetNodePath(ptr),
386 struct filter_input *p
387 = new_input(tinfo, DOM_INPUT_MARC);
388 p->u.marc.handle = yaz_marc_create();
389 p->u.marc.iconv = iconv;
391 yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
395 parse_convert(tinfo, ptr, &p->convert);
400 else if (!XML_STRCMP(ptr->name, "xmlreader"))
402 struct filter_input *p
403 = new_input(tinfo, DOM_INPUT_XMLREADER);
404 struct _xmlAttr *attr;
405 const char *level_str = 0;
/* split_level 0 makes filter_extract read the whole stream as one record */
407 p->u.xmlreader.split_level = 0;
408 p->u.xmlreader.reader = 0;
410 for (attr = ptr->properties; attr; attr = attr->next)
412 if (attr_content(attr, "level", &level_str))
415 yaz_log(YLOG_WARN, "%s: dom filter: "
416 "%s bad attribute @%s,"
418 tinfo->fname, xmlGetNodePath(ptr),
/* NOTE(review): level_str stays 0 when @level is absent; a guard is
   presumably on a line missing from this excerpt -- confirm, since
   atoi(NULL) is undefined */
422 p->u.xmlreader.split_level = atoi(level_str);
426 parse_convert(tinfo, ptr, &p->convert);
431 yaz_log(YLOG_WARN, "%s: dom filter: "
432 "%s bad element <%s>,"
433 " expected <marc>|<xmlreader>",
434 tinfo->fname, xmlGetNodePath(ptr), ptr->name);
/* Loads and interprets the filter configuration file fname.  The name is
   resolved against profile_path, the file is parsed with xmlParseFile and
   kept in tinfo->doc_config, and the children of the <dom> root are
   dispatched: <extract>, <retrieve>, <store> and <input>.  Unknown elements
   and attributes are reported with YLOG_WARN.  Return statements are not
   visible in this excerpt. */
441 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
443 char tmp_full_name[1024];
447 tinfo->fname = odr_strdup(tinfo->odr_config, fname);
/* fall back to the unresolved name when it is not found on profile_path */
449 if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path,
450 NULL, tmp_full_name))
451 tinfo->full_name = odr_strdup(tinfo->odr_config, tmp_full_name);
453 tinfo->full_name = odr_strdup(tinfo->odr_config, tinfo->fname);
455 yaz_log(YLOG_LOG, "%s dom filter: "
456 "loading config file %s", tinfo->fname, tinfo->full_name);
458 doc = xmlParseFile(tinfo->full_name);
461 yaz_log(YLOG_WARN, "%s: dom filter: "
462 "failed to parse config file %s",
463 tinfo->fname, tinfo->full_name);
466 /* save because we store ptrs to the content */
467 tinfo->doc_config = doc;
469 ptr = xmlDocGetRootElement(doc);
/* NOTE(review): when ptr is NULL the condition below is true and the
   warning then evaluates ptr->name (and xmlGetNodePath(ptr)), i.e. a
   null-pointer dereference on a document without a root element */
470 if (!ptr || ptr->type != XML_ELEMENT_NODE
471 || XML_STRCMP(ptr->name, "dom"))
473 yaz_log(YLOG_WARN, "%s: dom filter: "
474 "%s bad root element <%s>,"
475 " expected root element <dom>",
476 tinfo->fname, xmlGetNodePath(ptr), ptr->name);
480 for (ptr = ptr->children; ptr; ptr = ptr->next)
482 if (ptr->type != XML_ELEMENT_NODE)
484 if (!XML_STRCMP(ptr->name, "extract"))
487 <extract name="index">
488 <xslt stylesheet="first.xsl"/>
489 <xslt stylesheet="second.xsl"/>
492 struct _xmlAttr *attr;
493 struct filter_extract *f =
494 odr_malloc(tinfo->odr_config, sizeof(*f));
499 for (attr = ptr->properties; attr; attr = attr->next)
501 if (attr_content(attr, "name", &f->name))
504 yaz_log(YLOG_WARN, "%s: dom filter: "
505 "%s bad attribute @%s"
508 xmlGetNodePath(ptr),attr->name);
511 parse_convert(tinfo, ptr->children, &f->convert);
513 else if (!XML_STRCMP(ptr->name, "retrieve"))
517 <xslt stylesheet="some.xsl"/>
518 <xslt stylesheet="some.xsl"/>
521 struct _xmlAttr *attr;
522 struct filter_retrieve **fp = &tinfo->retrieve_list;
523 struct filter_retrieve *f =
524 odr_malloc(tinfo->odr_config, sizeof(*f));
/* a <retrieve> section can be selected by @identifier or by @name */
535 for (attr = ptr->properties; attr; attr = attr->next)
537 if (attr_content(attr, "identifier",
540 else if (attr_content(attr, "name", &f->name))
543 yaz_log(YLOG_WARN, "%s: dom filter: "
544 "%s bad attribute @%s"
545 " expected @identifier|@name",
547 xmlGetNodePath(ptr),attr->name);
549 parse_convert(tinfo, ptr->children, &f->convert);
551 else if (!XML_STRCMP(ptr->name, "store"))
555 <xslt stylesheet="some.xsl"/>
556 <xslt stylesheet="some.xsl"/>
559 struct filter_store *f =
560 odr_malloc(tinfo->odr_config, sizeof(*f));
564 parse_convert(tinfo, ptr->children, &f->convert);
566 else if (!XML_STRCMP(ptr->name, "input"))
/* NOTE(review): the sample below writes @inputcharset on <marc>, but
   parse_input only accepts @charset and warns about anything else */
570 <xmlreader level="1"/>
572 <input syntax="usmarc">
573 <marc inputcharset="marc-8"/>
576 struct _xmlAttr *attr;
577 const char *syntax = 0;
578 const char *name = 0;
579 for (attr = ptr->properties; attr; attr = attr->next)
581 if (attr_content(attr, "syntax", &syntax))
583 else if (attr_content(attr, "name", &name))
586 yaz_log(YLOG_WARN, "%s: dom filter: "
587 "%s bad attribute @%s"
588 " expected @syntax|@name",
590 xmlGetNodePath(ptr),attr->name);
592 parse_input(tinfo, ptr->children, syntax, name);
596 yaz_log(YLOG_WARN, "%s: dom filter: "
597 "%s bad element <%s>,"
598 " expected <extract>|<input>|<retrieve>|<store>",
599 tinfo->fname, xmlGetNodePath(ptr), ptr->name);
/* Searches tinfo->retrieve_list for the entry matching the requested
   schema string est, comparing it with each entry's identifier and name.
   The return statements are not visible in this excerpt. */
606 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
609 struct filter_retrieve *f = tinfo->retrieve_list;
611 /* return the first schema if no est is provided */
614 for (; f; f = f->next)
616 /* find requested schema */
/* either attribute may be unset, hence the null checks before strcmp */
619 if (f->identifier && !strcmp(f->identifier, est))
621 if (f->name && !strcmp(f->name, est))
/* Configures the filter from args, the name of the configuration file.
   A warning is logged when no file is given.  The profile path is taken
   from the "profilePath" resource and the file is then handed to parse_dom. */
628 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
630 struct filter_info *tinfo = clientData;
633 yaz_log(YLOG_WARN, "dom filter: need config file");
/* same file name as the one already loaded; presumably returns early
   here (the statement is not visible in this excerpt) */
637 if (tinfo->fname && !strcmp(args, tinfo->fname))
640 tinfo->profile_path = res_get(res, "profilePath");
643 return parse_dom(tinfo, args);
/* Releases the two ODR memory handles created in filter_init.  Any other
   cleanup steps are on lines not visible in this excerpt. */
646 static void filter_destroy(void *clientData)
648 struct filter_info *tinfo = clientData;
650 odr_destroy(tinfo->odr_config);
651 odr_destroy(tinfo->odr_record);
/* Read callback handed to libxml2 (xmlReaderForIO / xmlReadIO) during
   extraction: context is the recExtractCtrl, and the request is forwarded
   to its stream's readf function. */
655 static int ioread_ex(void *context, char *buffer, int len)
657 struct recExtractCtrl *p = context;
658 return p->stream->readf(p->stream, buffer, len);
661 static int ioclose_ex(void *context)
668 /* Alvis style indexing */
/* Recursively visits ptr, its siblings and all their descendants and hands
   the content of text nodes to ctrl->tokenAdd through recWord.  The term
   buffer points directly at the node's content; nothing is copied. */
669 static void index_cdata(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
670 xmlNodePtr ptr, RecWord *recWord)
672 for(; ptr; ptr = ptr->next)
/* descend first, then handle the node itself */
674 index_cdata(tinfo, ctrl, ptr->children, recWord);
675 if (ptr->type != XML_TEXT_NODE)
677 recWord->term_buf = (const char *)ptr->content;
678 recWord->term_len = XML_STRLEN(ptr->content);
679 (*ctrl->tokenAdd)(recWord);
683 /* Alvis style indexing */
/* Recursively searches the tree for <index> elements in the zebra XSLT
   namespace.  For each one, @name selects the index, the first character of
   @type (if given) temporarily overrides recWord->index_type, and the text
   below the element is indexed with index_cdata.  @xpath is accepted but
   not used in the lines visible here. */
684 static void index_node(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
685 xmlNodePtr ptr, RecWord *recWord)
687 for(; ptr; ptr = ptr->next)
689 index_node(tinfo, ctrl, ptr->children, recWord);
/* only elements in the zebra XSLT namespace are of interest */
690 if (ptr->type != XML_ELEMENT_NODE || !ptr->ns ||
691 XML_STRCMP(ptr->ns->href, zebra_xslt_ns))
693 if (!XML_STRCMP(ptr->name, "index"))
695 const char *name_str = 0;
696 const char *type_str = 0;
697 const char *xpath_str = 0;
698 struct _xmlAttr *attr;
699 for (attr = ptr->properties; attr; attr = attr->next)
701 if (attr_content(attr, "name", &name_str))
703 else if (attr_content(attr, "xpath", &xpath_str))
705 else if (attr_content(attr, "type", &type_str))
708 yaz_log(YLOG_WARN, "%s: dom filter: "
709 "bad attribute %s for <index>",
710 tinfo->fname, attr->name);
714 /* save default type */
715 int prev_type = recWord->index_type;
718 if (type_str && *type_str)
719 recWord->index_type = *type_str;
721 recWord->index_name = name_str;
722 index_cdata(tinfo, ctrl, ptr->children, recWord);
724 /* restore it again */
725 recWord->index_type = prev_type;
731 /* Alvis style indexing */
/* Handles the <record> element in the zebra XSLT namespace: reads @type
   (default "update"), @id and @rank, copies the id into
   ctrl->match_criteria, sets ctrl->staticrank, and for type "update"
   indexes the tree with index_node.  Other types only produce warnings. */
732 static void index_record(struct filter_info *tinfo,struct recExtractCtrl *ctrl,
733 xmlNodePtr ptr, RecWord *recWord)
735 const char *type_str = "update";
737 if (ptr && ptr->type == XML_ELEMENT_NODE && ptr->ns &&
738 !XML_STRCMP(ptr->ns->href, zebra_xslt_ns)
739 && !XML_STRCMP(ptr->name, "record"))
741 const char *id_str = 0;
742 const char *rank_str = 0;
743 struct _xmlAttr *attr;
744 for (attr = ptr->properties; attr; attr = attr->next)
746 if (attr_content(attr, "type", &type_str))
748 else if (attr_content(attr, "id", &id_str))
750 else if (attr_content(attr, "rank", &rank_str))
753 yaz_log(YLOG_WARN, "%s: dom filter: "
754 "bad attribute %s for <record>",
755 tinfo->fname, attr->name);
/* %255s limits the copy to 255 characters and stops at whitespace */
758 sscanf(id_str, "%255s", ctrl->match_criteria);
761 ctrl->staticrank = atozint(rank_str);
765 if (!strcmp("update", type_str))
766 index_node(tinfo, ctrl, ptr, recWord);
767 else if (!strcmp("delete", type_str))
/* NOTE(review): the format string contains %s but no argument follows
   it; tinfo->fname looks like the intended argument */
768 yaz_log(YLOG_WARN, "%s dom filter: "
769 "delete: to be implemented");
771 yaz_log(YLOG_WARN, "dom filter: "
772 "unknown record type '%s'",
777 /* Alvis style indexing */
/* Indexes a whole document the Alvis way: initialises a RecWord, optionally
   dumps the document to stdout when flagShowRecords is set, and passes the
   root element to index_record. */
778 static void extract_doc_alvis(struct filter_info *tinfo,
779 struct recExtractCtrl *extctr,
788 (*extctr->init)(extctr, &recWord);
790 if (extctr->flagShowRecords){
/* NOTE(review): buf_out is allocated by xmlDocDumpMemory; its release is
   not visible in this excerpt -- confirm it is freed */
791 xmlDocDumpMemory(doc, &buf_out, &len_out);
792 fwrite(buf_out, len_out, 1, stdout);
795 root_ptr = xmlDocGetRootElement(doc);
797 index_record(tinfo, extctr, root_ptr, &recWord);
/* NOTE(review): the format string contains %s but no argument follows
   it; tinfo->fname looks like the intended argument */
799 yaz_log(YLOG_WARN, "%s dom filter: "
800 "No root for index XML record");
805 /* DOM filter style indexing */
/* xmlChar flavour of attr_content: when attr is named `name` and its first
   child is a text node, *dst_content is pointed at that node's content
   (owned by the document, not copied).  Return statements are not visible
   here; callers use the result as a boolean. */
806 static int attr_content_xml(struct _xmlAttr *attr, const char *name,
807 xmlChar **dst_content)
809 if (0 == XML_STRCMP(attr->name, name) && attr->children
810 && attr->children->type == XML_TEXT_NODE)
812 *dst_content = (attr->children->content);
819 /* DOM filter style indexing */
/* Indexes the text content of node under every index named in index_p.
   index_p is scanned as a space-separated list of entries of the form
   name or name:type; for each entry the node text is passed to
   extctr->tokenAdd with that index name and the first character of the
   type. */
820 static void index_value_of(struct filter_info *tinfo,
821 struct recExtractCtrl *extctr,
/* NOTE(review): xmlNodeGetContent can return NULL, and strlen is applied
   to the result on the next line before any check visible here.  The
   returned string must also be released with xmlFree; that call is not
   visible in this excerpt -- confirm */
825 xmlChar *text = xmlNodeGetContent(node);
826 size_t text_len = strlen((const char *)text);
829 /* if there is no text, we do not need to proceed */
832 xmlChar *look = index_p;
839 /* assigning text to be indexed */
841 (*extctr->init)(extctr, &recWord);
842 recWord.term_buf = (const char *)text;
843 recWord.term_len = text_len;
845 /* parsing all index name/type pairs */
846 /* may not start with ' ' or ':' */
847 while (*look && ' ' != *look && ':' != *look){
849 /* setting name and type to zero */
853 /* parsing one index name */
855 while (*look && ':' != *look && ' ' != *look){
/* NOTE(review): the copy length comes from the input; the declared sizes
   of index and type are not visible here -- confirm they bound
   eval - bval */
859 strncpy((char *)index, (const char *)bval, eval - bval);
860 index[eval - bval] = '\0';
863 /* parsing one index type, if existing */
868 while (*look && ' ' != *look){
872 strncpy((char *)type, (const char *)bval, eval - bval);
873 type[eval - bval] = '\0';
876 /* actually indexing the text given */
877 yaz_log(YLOG_DEBUG, "%s dom filter: "
878 "INDEX '%s:%s' '%s'",
879 tinfo->fname, index, type, text);
881 recWord.index_name = (const char *)index;
/* only the first character of the type string is used */
883 recWord.index_type = *type;
884 (extctr->tokenAdd)(&recWord);
886 /* eat whitespace */
887 if (*look && ' ' == *look && *(look+1)){
897 /* DOM filter style indexing */
/* Records per-record metadata on extctr: the id is copied into
   match_criteria and the rank is converted with atozint into staticrank.
   The record type is only logged; its handling is still commented out
   below. */
898 static void set_record_info(struct filter_info *tinfo,
899 struct recExtractCtrl *extctr,
/* NOTE(review): the three values are printed with %s, yet callers can pass
   null pointers (process_xml_pi_node passes 0 as the last argument);
   confirm yaz_log tolerates a null %s argument */
904 yaz_log(YLOG_DEBUG, "%s dom filter: "
905 "RECORD id=%s rank=%s type=%s",
906 tinfo->fname, id_p, rank_p, type_p);
/* %255s limits the copy to 255 characters and stops at whitespace */
909 sscanf((const char *)id_p, "%255s", extctr->match_criteria);
912 extctr->staticrank = atozint((const char *)rank_p);
914 /* if (!strcmp("update", type_str)) */
915 /* index_node(tinfo, ctrl, ptr, recWord); */
916 /* else if (!strcmp("delete", type_str)) */
917 /* yaz_log(YLOG_WARN, "dom filter delete: to be implemented"); */
919 /* yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'", */
925 /* DOM filter style indexing */
/* Handles an element in the zebra DOM namespace.  <index name="..."> has
   its text indexed via index_value_of; <record> has @id, @rank and @type
   collected and passed to set_record_info.  Any other element in that
   namespace, and any unexpected attribute, is reported with YLOG_WARN.
   Elements outside the namespace are ignored by the test below. */
926 static void process_xml_element_zebra_node(struct filter_info *tinfo,
927 struct recExtractCtrl *extctr,
930 if (node->type == XML_ELEMENT_NODE
931 && node->ns && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns)){
933 if (0 == XML_STRCMP(node->name, "index")){
934 xmlChar *index_p = 0;
936 struct _xmlAttr *attr;
937 for (attr = node->properties; attr; attr = attr->next){
938 if (attr_content_xml(attr, "name", &index_p)){
939 index_value_of(tinfo, extctr, node, index_p);
942 yaz_log(YLOG_WARN,"%s dom filter: "
943 "%s bad attribute @%s, expected @name",
944 tinfo->fname, xmlGetNodePath(node), attr->name);
947 else if (0 == XML_STRCMP(node->name, "record")){
952 struct _xmlAttr *attr;
953 for (attr = node->properties; attr; attr = attr->next){
954 if (attr_content_xml(attr, "id", &id_p))
956 else if (attr_content_xml(attr, "rank", &rank_p))
958 else if (attr_content_xml(attr, "type", &type_p))
961 yaz_log(YLOG_WARN,"%s dom filter: "
962 "%s bad attribute @%s,"
963 " expected @id|@rank|@type",
964 tinfo->fname, xmlGetNodePath(node), attr->name);
/* NOTE(review): this check appears to follow the attribute loop (its
   closing brace is not visible in this excerpt); if so, attr is NULL
   here and attr->name in the warning is a null-pointer dereference */
966 if (type_p && 0 != strcmp("update", (const char *)type_p))
967 yaz_log(YLOG_WARN,"%s dom filter: "
969 " only implemented '@type='update'",
970 tinfo->fname, xmlGetNodePath(node), attr->name);
974 set_record_info(tinfo, extctr, id_p, rank_p, type_p);
976 yaz_log(YLOG_WARN,"%s dom filter: "
977 "%s bad element <%s>,"
978 " expected <record>|<index> in namespace '%s'",
979 tinfo->fname, xmlGetNodePath(node),
980 node->name, zebra_dom_ns);
987 /* DOM filter style indexing */
/* Interprets a processing instruction whose target equals zebra_pi_name.
   Its content is either "record" followed by optional id= and rank= values,
   which are forwarded to set_record_info, or "index" followed by index
   instructions that are handed back to the caller.  Content that cannot be
   parsed is reported with YLOG_WARN. */
988 static void process_xml_pi_node(struct filter_info *tinfo,
989 struct recExtractCtrl *extctr,
994 /* yaz_log(YLOG_DEBUG,"PI %s\n", xmlGetNodePath(node)); */
996 /* if right PI name, continue parsing PI */
997 if (0 == strcmp(zebra_pi_name, (const char *)node->name)){
998 xmlChar *pi_p = node->content;
999 xmlChar *look = pi_p;
1004 /* parsing PI record instructions */
1005 if (0 == strncmp((const char *)look, "record", 6)){
1016 /* eat whitespace */
1017 while (*look && ' ' == *look && *(look+1))
1020 /* parse possible id */
1021 if (*look && 0 == strncmp((const char *)look, "id=", 3)){
1024 while (*look && ' ' != *look)
/* NOTE(review): the copy length comes from the PI content; the declared
   sizes of id and rank are not visible here -- confirm they bound
   eval - bval */
1027 strncpy((char *)id, (const char *)bval, eval - bval);
1028 id[eval - bval] = '\0';
1031 /* eat whitespace */
1032 while (*look && ' ' == *look && *(look+1))
1035 /* parse possible rank */
1036 if (*look && 0 == strncmp((const char *)look, "rank=", 5)){
1039 while (*look && ' ' != *look)
1042 strncpy((char *)rank, (const char *)bval, eval - bval);
1043 rank[eval - bval] = '\0';
1046 /* eat whitespace */
1047 while (*look && ' ' == *look && *(look+1))
/* anything left over after id= and rank= is unparsable */
1050 if (look && '\0' != *look)
1051 yaz_log(YLOG_WARN,"%s dom filter: "
1052 "%s content '%s', can not parse '%s'",
1053 tinfo->fname, xmlGetNodePath(node), pi_p, look);
/* the record type is not available from a PI, hence the trailing 0 */
1055 set_record_info(tinfo, extctr, id, rank, 0);
1059 /* parsing index instruction */
1060 else if (0 == strncmp((const char *)look, "index", 5)){
1063 /* eat whitespace */
1064 while (*look && ' ' == *look && *(look+1))
1067 /* export index instructions to outside */
1071 yaz_log(YLOG_WARN,"%s dom filter: "
1072 "%s content '%s', can not parse '%s'",
1073 tinfo->fname, xmlGetNodePath(node), pi_p, look);
1077 /* DOM filter style indexing */
/* Recursive tree walk.  The node itself is first offered to
   process_xml_element_zebra_node; then each child is visited: processing
   instructions go to process_xml_pi_node, which may leave an index
   instruction in index_p, and element children are indexed with that
   instruction (if any) and then walked recursively. */
1078 static void process_xml_element_node(struct filter_info *tinfo,
1079 struct recExtractCtrl *extctr,
1082 /* remember indexing instruction from PI to next element node */
1083 xmlChar *index_p = 0;
1085 /* yaz_log(YLOG_DEBUG,"ELEM %s\n", xmlGetNodePath(node)); */
1087 /* check if we are an element node in the special zebra namespace
1088 and either set record data or index value-of node content*/
1089 process_xml_element_zebra_node(tinfo, extctr, node);
1091 /* loop through kid nodes */
1092 for (node = node->children; node; node = node->next)
1094 /* check and set PI record and index instructions */
1095 if (node->type == XML_PI_NODE){
1096 process_xml_pi_node(tinfo, extctr, node, &index_p);
1098 else if (node->type == XML_ELEMENT_NODE){
1099 /* if there was a PI index instruction before this element */
1101 index_value_of(tinfo, extctr, node, index_p);
1104 process_xml_element_node(tinfo, extctr, node);
1112 /* DOM filter style indexing */
/* Entry point for DOM-style indexing of a document: optionally dumps the
   document to stdout when flagShowRecords is set, then starts the recursive
   walk with the document node itself cast to xmlNodePtr. */
1113 static void extract_dom_doc_node(struct filter_info *tinfo,
1114 struct recExtractCtrl *extctr,
1117 /* yaz_log(YLOG_DEBUG,"DOC %s\n", xmlGetNodePath((xmlNodePtr)doc)); */
1121 if (extctr->flagShowRecords){
/* NOTE(review): buf_out is allocated by xmlDocDumpMemory; its release is
   not visible in this excerpt -- confirm it is freed */
1122 xmlDocDumpMemory(doc, &buf_out, &len_out);
1123 fwrite(buf_out, len_out, 1, stdout);
1127 process_xml_element_node(tinfo, extctr, (xmlNodePtr)doc);
/* Runs one parsed record through the configured pipeline:
   1. the input's own conversion chain,
   2. the store chain, applied to a copy of the document, whose serialised
      result is handed to p->setStoreData,
   3. the extract chain, after which the document is indexed with
      extract_dom_doc_node.
   NOTE(review): the results of the three perform_convert calls are ignored
   in the lines visible here. */
1133 static int convert_extract_doc(struct filter_info *tinfo,
1134 struct filter_input *input,
1135 struct recExtractCtrl *p,
1141 const char *params[10];
1142 xsltStylesheetPtr last_xsp = 0;
1143 xmlDocPtr store_doc = 0;
1146 set_param_str(params, "schema", zebra_dom_ns, tinfo->odr_record);
1148 /* input conversion */
1149 perform_convert(tinfo, input->convert, params, &doc, 0);
1153 /* store conversion */
/* work on a copy so the extract conversion still sees the input result */
1154 store_doc = xmlCopyDoc(doc, 1);
1155 perform_convert(tinfo, tinfo->store->convert,
1156 params, &store_doc, &last_xsp);
/* two serialisation paths: through the last stylesheet applied, or a
   plain dump of the document */
1160 xsltSaveResultToString(&buf_out, &len_out,
1161 store_doc ? store_doc : doc, last_xsp);
1163 xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1164 if (p->flagShowRecords)
1165 fwrite(buf_out, len_out, 1, stdout);
1166 (*p->setStoreData)(p, buf_out, len_out);
1170 xmlFreeDoc(store_doc);
1172 /* extract conversion */
1173 perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0);
1175 /* finally, do the indexing */
1177 extract_dom_doc_node(tinfo, p, doc);
1178 /* extract_doc_alvis(tinfo, p, doc); */
1182 return RECCTRL_EXTRACT_OK;
/* Extracts one record from an XML stream that holds many records.  On the
   first record of a stream a fresh xmlTextReader is created over the
   extraction stream.  The reader is advanced until an element at depth
   split_level is found; that element is expanded, deep-copied into a new
   document and passed to convert_extract_doc.  When the reader runs out of
   nodes it is freed and RECCTRL_EXTRACT_EOF is returned. */
1185 static int extract_xml_split(struct filter_info *tinfo,
1186 struct filter_input *input,
1187 struct recExtractCtrl *p)
1191 if (p->first_record)
/* drop a reader left over from a previous stream */
1193 if (input->u.xmlreader.reader)
1194 xmlFreeTextReader(input->u.xmlreader.reader);
1195 input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1196 p /* I/O handler */,
1202 if (!input->u.xmlreader.reader)
1203 return RECCTRL_EXTRACT_ERROR_GENERIC;
1205 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1208 int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1209 int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
1210 if (type == XML_READER_TYPE_ELEMENT &&
1211 input->u.xmlreader.split_level == depth)
1214 = xmlTextReaderExpand(input->u.xmlreader.reader);
/* the copy becomes the root element of the new document built here */
1217 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1218 xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1220 xmlDocSetRootElement(doc, ptr2);
1222 return convert_extract_doc(tinfo, input, p, doc);
1226 xmlFreeTextReader(input->u.xmlreader.reader);
1227 input->u.xmlreader.reader = 0;
1228 return RECCTRL_EXTRACT_ERROR_GENERIC;
1231 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1233 xmlFreeTextReader(input->u.xmlreader.reader);
1234 input->u.xmlreader.reader = 0;
1235 return RECCTRL_EXTRACT_EOF;
/* Treats the whole input stream as a single XML record: on the first call
   for a stream the document is parsed with xmlReadIO and passed to
   convert_extract_doc; any later call reports RECCTRL_EXTRACT_EOF. */
1238 static int extract_xml_full(struct filter_info *tinfo,
1239 struct filter_input *input,
1240 struct recExtractCtrl *p)
1242 if (p->first_record) /* only one record per stream */
1244 xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex,
1245 p /* I/O handler */,
/* parser options: process XInclude and substitute entities */
1248 XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1251 return RECCTRL_EXTRACT_ERROR_GENERIC;
1253 return convert_extract_doc(tinfo, input, p, doc);
1256 return RECCTRL_EXTRACT_EOF;
/* Reads one ISO 2709 (MARC) record from the stream.  The first five bytes
   are the record length; leading bytes that are not digits are skipped one
   at a time with a warning.  The rest of the record is then read, decoded
   with yaz_marc_read_iso2709, converted to an XML tree and passed to
   convert_extract_doc.
   NOTE(review): the declaration of buf is not visible in this excerpt; a
   5-digit length allows up to 99999 bytes, so confirm buf is at least that
   large.  Only the first length byte is checked for being a digit. */
1259 static int extract_iso2709(struct filter_info *tinfo,
1260 struct filter_input *input,
1261 struct recExtractCtrl *p)
1267 if (p->stream->readf(p->stream, buf, 5) != 5)
1268 return RECCTRL_EXTRACT_EOF;
/* resynchronise on a byte that can start the length field */
1269 while (*buf < '0' || *buf > '9')
1273 yaz_log(YLOG_WARN, "%s dom filter: "
1274 "MARC: Skipping bad byte %d (0x%02X)",
1275 tinfo->fname, *buf & 0xff, *buf & 0xff);
1276 for (i = 0; i<4; i++)
1279 if (p->stream->readf(p->stream, buf+4, 1) != 1)
1280 return RECCTRL_EXTRACT_EOF;
1282 record_length = atoi_n (buf, 5);
1283 if (record_length < 25)
1285 yaz_log (YLOG_WARN, "%s dom filter: "
1286 "MARC record length < 25, is %d",
1287 tinfo->fname, record_length);
1288 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* the first 5 bytes are already in buf; read the remainder */
1290 read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1291 if (read_bytes < record_length-5)
1293 yaz_log (YLOG_WARN, "%s dom filter: "
1294 "Couldn't read whole MARC record",
1296 return RECCTRL_EXTRACT_ERROR_GENERIC;
1298 r = yaz_marc_read_iso2709(input->u.marc.handle, buf, record_length);
1299 if (r < record_length)
1301 yaz_log (YLOG_WARN, "%s dom filter: "
1302 "Parsing of MARC record failed r=%d length=%d",
1303 tinfo->fname, r, record_length);
1304 return RECCTRL_EXTRACT_ERROR_GENERIC;
1310 yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0);
1311 rdoc = xmlNewDoc((const xmlChar*) "1.0");
1312 xmlDocSetRootElement(rdoc, root_ptr);
1313 return convert_extract_doc(tinfo, input, p, rdoc);
/* NOTE(review): this follows an unconditional-looking return; whether it
   is reachable depends on lines missing from this excerpt */
1315 return RECCTRL_EXTRACT_OK;
/* Extraction entry point.  Uses the first entry of tinfo->input_list,
   resets the per-record ODR memory and dispatches on the input type:
   XML readers go to extract_xml_full when split_level is 0 and to
   extract_xml_split otherwise, MARC inputs go to extract_iso2709. */
1318 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1320 struct filter_info *tinfo = clientData;
1321 struct filter_input *input = tinfo->input_list;
1324 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* memory handed out from odr_record only lives for one extract call */
1326 odr_reset(tinfo->odr_record);
1329 case DOM_INPUT_XMLREADER:
1330 if (input->u.xmlreader.split_level == 0)
1331 return extract_xml_full(tinfo, input, p);
1333 return extract_xml_split(tinfo, input, p);
1335 case DOM_INPUT_MARC:
1336 return extract_iso2709(tinfo, input, p);
1338 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* Read callback handed to xmlReadIO during retrieval: context is the
   recRetrieveCtrl, and the request is forwarded to its stream's readf
   function. */
1341 static int ioread_ret(void *context, char *buffer, int len)
1343 struct recRetrieveCtrl *p = context;
1344 return p->stream->readf(p->stream, buffer, len);
1347 static int ioclose_ret(void *context)
/* Retrieval entry point.  Determines the requested element set name from
   p->comp, looks up the matching <retrieve> section, builds the stylesheet
   parameters (id, filename, rank, schema, score, size), parses the stored
   record with xmlReadIO, applies the section's conversion chain and returns
   the serialised result as XML or SUTRS.  Failures are reported through
   p->diagnostic. */
1352 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
1354 /* const char *esn = zebra_dom_ns; */
1355 const char *esn = 0;
1356 const char *params[32];
1357 struct filter_info *tinfo = clientData;
1359 struct filter_retrieve *retrieve;
1360 xsltStylesheetPtr last_xsp = 0;
/* the element set name can arrive in a simple or a complex record
   composition */
1364 if (p->comp->which == Z_RecordComp_simple
1365 && p->comp->u.simple->which == Z_ElementSetNames_generic)
1367 esn = p->comp->u.simple->u.generic;
1369 else if (p->comp->which == Z_RecordComp_complex
1370 && p->comp->u.complex->generic->elementSpec
1371 && p->comp->u.complex->generic->elementSpec->which ==
1372 Z_ElementSpec_elementSetName)
1374 esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1377 retrieve = lookup_retrieve(tinfo, esn);
1381 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1386 set_param_int(params, "id", p->localno, p->odr);
1388 set_param_str(params, "filename", p->fname, p->odr);
1389 if (p->staticrank >= 0)
1390 set_param_int(params, "rank", p->staticrank, p->odr);
/* "schema" is taken from the request, else from the section's name or
   identifier, else left empty */
1393 set_param_str(params, "schema", esn, p->odr);
1396 set_param_str(params, "schema", retrieve->name, p->odr);
1397 else if (retrieve->identifier)
1398 set_param_str(params, "schema", retrieve->identifier, p->odr);
1400 set_param_str(params, "schema", "", p->odr);
1403 set_param_int(params, "score", p->score, p->odr);
1404 set_param_int(params, "size", p->recordSize, p->odr);
1406 doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1409 XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1412 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1416 /* retrieve conversion */
1417 perform_convert(tinfo, retrieve->convert, params, &doc, &last_xsp);
1420 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1422 else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML)
1428 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1430 xmlDocDumpMemory(doc, &buf_out, &len_out);
1432 p->output_format = VAL_TEXT_XML;
1433 p->rec_len = len_out;
/* the record is copied into ODR memory owned by the request */
1434 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1435 memcpy(p->rec_buf, buf_out, p->rec_len);
/* NOTE(review): the XML branch above tests p->input_format while this
   branch tests p->output_format -- confirm which field is intended */
1438 else if (p->output_format == VAL_SUTRS)
1444 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1446 xmlDocDumpMemory(doc, &buf_out, &len_out);
1448 p->output_format = VAL_SUTRS;
1449 p->rec_len = len_out;
1450 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1451 memcpy(p->rec_buf, buf_out, p->rec_len);
1457 p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
1463 static struct recType filter_type = {
1474 #ifdef IDZEBRA_STATIC_DOM
1487 * indent-tabs-mode: nil
1489 * vim: shiftwidth=4 tabstop=8 expandtab