1 /* $Id: mod_dom.c,v 1.12 2007-02-15 13:01:00 marc Exp $
2 Copyright (C) 1995-2007
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
27 #include <yaz/diagbib1.h>
28 #include <yaz/tpath.h>
30 #include <libxml/xmlversion.h>
31 #include <libxml/parser.h>
32 #include <libxml/tree.h>
33 #include <libxml/xmlIO.h>
34 #include <libxml/xmlreader.h>
35 #include <libxslt/transform.h>
36 #include <libxslt/xsltutils.h>
39 #include <libexslt/exslt.h>
42 #include <idzebra/util.h>
43 #include <idzebra/recctrl.h>
47 /* Alvis style indexing: namespace URI compared in index_node() and index_record() */
48 #define ZEBRA_SCHEMA_XSLT_NS "http://indexdata.dk/zebra/xslt/1"
49 static const char *zebra_xslt_ns = ZEBRA_SCHEMA_XSLT_NS;
51 /* DOM filter style indexing: namespace URI compared in process_xml_element_zebra_node() and passed as the "schema" parameter in convert_extract_doc() */
52 #define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
53 static const char *zebra_dom_ns = ZEBRA_DOM_NS;
55 /* DOM filter style indexing: processing-instruction name compared in process_xml_pi_node() */
56 #define ZEBRA_PI_NAME "zebra-2.0"
57 static const char *zebra_pi_name = ZEBRA_PI_NAME;
/* Fields of one conversion step. The opening line of this declaration is
   not visible here; presumably struct convert_s, judging by the type of
   the next pointer. */
/* value of the stylesheet attribute of an xslt element (filled in by parse_convert) */
62 const char *stylesheet;
/* parsed stylesheet handle; released by destroy_xsp() */
63 xsltStylesheetPtr stylesheet_xsp;
/* next step of the chain that perform_convert() walks */
64 struct convert_s *next;
/* Configuration of the extract stage. Other members (parse_dom stores a
   name attribute through f->name) are on lines not visible here. */
67 struct filter_extract {
/* conversion chain that convert_extract_doc() applies just before indexing */
69 struct convert_s *convert;
/* Conversion chain applied to a copy of the document before it is stored
   (see convert_extract_doc). Presumably the member of struct filter_store;
   the declaration header is not visible here. */
73 struct convert_s *convert;
/* One retrieve configuration; the entries form a singly linked list that
   lookup_retrieve() searches. A name member used by lookup_retrieve() is
   on a line not visible here. */
76 struct filter_retrieve {
/* compared with the requested element set name in lookup_retrieve() */
78 const char *identifier;
/* conversion chain run by filter_retrieve() */
79 struct convert_s *convert;
80 struct filter_retrieve *next;
/* Input kinds: passed to new_input() by parse_input() and used as case
   labels in destroy_dom() and filter_extract(). */
83 #define DOM_INPUT_XMLREADER 1
84 #define DOM_INPUT_MARC 2
/* Fields of one configured input. The declaration header and several
   members (for example the u.marc and u.xmlreader members used elsewhere)
   are on lines not visible here; presumably struct filter_input. */
/* conversion chain that convert_extract_doc() runs first on each record */
88 struct convert_s *convert;
92 const char *input_charset;
/* streaming reader created in extract_xml_split(); freed there and in destroy_dom() */
97 xmlTextReaderPtr reader;
/* next input; new_input() appends at the tail of this list */
101 struct filter_input *next;
/* Fields of the per-filter state that filter_init() allocates. The
   declaration header and further members (fname, full_name, odr_record,
   odr_config are used elsewhere) are on lines not visible here;
   presumably struct filter_info. */
/* value of the "profilePath" resource, set in filter_config() */
107 const char *profile_path;
/* parsed configuration document; kept because parsed settings point into it */
110 xmlDocPtr doc_config;
111 struct filter_extract *extract;
112 struct filter_retrieve *retrieve_list;
113 struct filter_input *input_list;
114 struct filter_store *store;
/* Thin wrappers that cast the first argument (an xmlChar pointer at the
   call sites in this file) to char* before calling strcmp / strlen. */
117 #define XML_STRCMP(a,b) strcmp((char*)a, b)
118 #define XML_STRLEN(a) strlen((char*)a)
/* Builds a single-quoted copy of value in memory taken from odr, for a
   stylesheet parameter called name. The lines that store name and the
   quoted text into params are not visible in this listing. */
123 static void set_param_str(const char **params, const char *name,
124 const char *value, ODR odr)
/* 3 extra bytes: two quote characters plus the terminating NUL */
126 char *quoted = odr_malloc(odr, 3 + strlen(value));
/* NOTE(review): value is copied unescaped; a value that itself contains a
   single quote would yield a malformed quoted string -- confirm callers. */
127 sprintf(quoted, "'%s'", value);
/* Integer counterpart of set_param_str(): formats value with ZINT_FORMAT
   between single quotes into a 30-byte buffer taken from odr. The rest of
   the signature and the stores into params are not visible here. */
135 static void set_param_int(const char **params, const char *name,
138 char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */
141 sprintf(quoted, "'" ZINT_FORMAT "'", value);
/* Allocates the filter state with xmalloc and initialises the members
   shown below. Two separate ODR memory handles are created: odr_record
   (reset at the start of every filter_extract call) and odr_config (holds
   data parsed from the configuration file). The res and recType arguments
   are not used in the visible lines; the return statement is not visible. */
147 static void *filter_init(Res res, RecType recType)
149 struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
151 tinfo->full_name = 0;
152 tinfo->profile_path = 0;
153 tinfo->odr_record = odr_createmem(ODR_ENCODE);
154 tinfo->odr_config = odr_createmem(ODR_ENCODE);
156 tinfo->retrieve_list = 0;
157 tinfo->input_list = 0;
159 tinfo->doc_config = 0;
/* Tests whether attr is called name and its first child is a text node;
   if so, *dst_content is pointed at that text. The pointer refers into
   the XML tree (no copy is made). Callers use the result as a boolean;
   the return statements are not visible here. */
168 static int attr_content(struct _xmlAttr *attr, const char *name,
169 const char **dst_content)
171 if (!XML_STRCMP(attr->name, name) && attr->children
172 && attr->children->type == XML_TEXT_NODE)
174 *dst_content = (const char *)(attr->children->content);
/* Frees the parsed stylesheet held by a conversion step, if any. Callers
   pass the head of a chain; the loop over the chain, if there is one, is
   on lines not visible here. */
180 static void destroy_xsp(struct convert_s *c)
184 if (c->stylesheet_xsp)
185 xsltFreeStylesheet(c->stylesheet_xsp);
/* Releases everything that was built from the configuration file:
   stylesheets of the extract, store, input and retrieve chains, the
   per-input reader or MARC handles, and the configuration document.
   Guards around the extract and store calls are not visible here. */
190 static void destroy_dom(struct filter_info *tinfo)
194 destroy_xsp(tinfo->extract->convert);
199 destroy_xsp(tinfo->store->convert);
202 if (tinfo->input_list)
204 struct filter_input *i_ptr;
205 for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
/* per-input resources depend on the input kind */
209 case DOM_INPUT_XMLREADER:
210 if (i_ptr->u.xmlreader.reader)
211 xmlFreeTextReader(i_ptr->u.xmlreader.reader);
214 yaz_iconv_close(i_ptr->u.marc.iconv);
215 yaz_marc_destroy(i_ptr->u.marc.handle);
218 destroy_xsp(i_ptr->convert);
/* the list nodes themselves were allocated from odr_config */
220 tinfo->input_list = 0;
222 if (tinfo->retrieve_list)
224 struct filter_retrieve *r_ptr;
225 for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
226 destroy_xsp(r_ptr->convert);
227 tinfo->retrieve_list = 0;
230 if (tinfo->doc_config)
232 xmlFreeDoc(tinfo->doc_config);
233 tinfo->doc_config = 0;
/* presumably releases all configuration allocations in one go -- confirm against YAZ odr_reset */
235 odr_reset(tinfo->odr_config);
/* Walks the sibling list starting at ptr and handles each xslt element:
   a conversion step is allocated from odr_config, its stylesheet
   attribute is read, the file is located with yaz_filepath_resolve and
   parsed with xsltParseStylesheetFile. Unknown attributes and elements
   are logged as warnings. How the step is linked through l and which
   values are returned is on lines not visible here. */
238 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
239 struct convert_s **l)
242 for(; ptr; ptr = ptr->next)
/* only element nodes are of interest */
244 if (ptr->type != XML_ELEMENT_NODE)
246 if (!XML_STRCMP(ptr->name, "xslt"))
248 struct _xmlAttr *attr;
250 = odr_malloc(tinfo->odr_config, sizeof(*p));
254 p->stylesheet_xsp = 0;
/* stylesheet is the only attribute accepted on an xslt element */
256 for (attr = ptr->properties; attr; attr = attr->next)
257 if (attr_content(attr, "stylesheet", &p->stylesheet))
260 yaz_log(YLOG_WARN, "%s: dom filter: "
263 tinfo->fname, attr->name);
/* fixed-size buffer that receives the resolved stylesheet path */
266 char tmp_xslt_full_name[1024];
267 if (!yaz_filepath_resolve(p->stylesheet,
274 "stylesheet %s not found in "
278 tinfo->profile_path);
283 = xsltParseStylesheetFile((const xmlChar*)
285 if (!p->stylesheet_xsp)
289 "could not parse xslt "
291 tinfo->fname, tmp_xslt_full_name);
299 "missing attribute 'stylesheet' "
300 "for element 'xslt'", tinfo->fname);
309 "%s: dom filter: bad node '%s' for <conv>",
310 tinfo->fname, ptr->name);
/* Applies every stylesheet of the chain in order with xsltApplyStylesheet
   and reports the last stylesheet used through last_xsp. The params and
   document parameters, the handling of res_doc and the return value are
   on lines not visible here. */
318 static ZEBRA_RES perform_convert(struct filter_info *tinfo,
319 struct convert_s *convert,
322 xsltStylesheetPtr *last_xsp)
324 for (; convert; convert = convert->next)
326 xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
/* NOTE(review): some callers pass 0 for last_xsp, so this store is
   presumably guarded by a check on a line not visible here -- confirm. */
329 *last_xsp = convert->stylesheet_xsp;
/* Appends a new input node at the tail of tinfo->input_list. The node is
   allocated from odr_config; its initialisation from type and the return
   statement are on lines not visible here. */
336 static struct filter_input *new_input(struct filter_info *tinfo, int type)
338 struct filter_input *p;
339 struct filter_input **np = &tinfo->input_list;
/* advance np to the terminating null link */
340 for (;*np; np = &(*np)->next)
342 p = *np = odr_malloc(tinfo->odr_config, sizeof(*p));
/* Parses the children of an input element. A marc child creates a MARC
   input with an iconv converter to utf-8; an xmlreader child creates a
   streaming XML input with an optional split level. Each kind then
   parses its own conversion chain. Any other element is logged as a bad
   input type. The remaining parameters (the caller passes syntax and
   name) and the return statements are on lines not visible here. */
351 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
355 for (; ptr; ptr = ptr->next)
357 if (ptr->type != XML_ELEMENT_NODE)
359 if (!XML_STRCMP(ptr->name, "marc"))
361 yaz_iconv_t iconv = 0;
/* default source charset when no charset attribute is present */
362 const char *input_charset = "marc-8";
363 struct _xmlAttr *attr;
365 for (attr = ptr->properties; attr; attr = attr->next)
367 if (attr_content(attr, "charset", &input_charset))
371 "%s: dom filter: bad attribute %s"
373 tinfo->fname, attr->name);
/* records are converted from input_charset to utf-8 */
375 iconv = yaz_iconv_open("utf-8", input_charset);
379 "%s: dom filter: unsupported charset "
381 tinfo->fname, input_charset);
386 struct filter_input *p
387 = new_input(tinfo, DOM_INPUT_MARC);
388 p->u.marc.handle = yaz_marc_create();
389 p->u.marc.iconv = iconv;
391 yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
/* note: the marc element itself is passed here, whereas the xmlreader
   branch and parse_dom pass a node in the same way or its children */
395 parse_convert(tinfo, ptr, &p->convert);
400 else if (!XML_STRCMP(ptr->name, "xmlreader"))
402 struct filter_input *p
403 = new_input(tinfo, DOM_INPUT_XMLREADER);
404 struct _xmlAttr *attr;
405 const char *level_str = 0;
/* split level 0 makes filter_extract() read the whole stream as one record */
407 p->u.xmlreader.split_level = 0;
408 p->u.xmlreader.reader = 0;
410 for (attr = ptr->properties; attr; attr = attr->next)
412 if (attr_content(attr, "level", &level_str))
416 "%s: dom filter: bad attribute %s"
418 tinfo->fname, attr->name);
/* level_str stays 0 without a level attribute; a guard is presumably on
   a line not visible here -- confirm */
421 p->u.xmlreader.split_level = atoi(level_str);
425 parse_convert(tinfo, ptr, &p->convert);
430 yaz_log(YLOG_WARN, "%s: dom filter: bad input type %s",
431 tinfo->fname, ptr->name);
/* Loads and parses the filter configuration file fname. The file name is
   resolved against tinfo->profile_path, parsed with xmlParseFile, and the
   root element must be dom. Its children extract, retrieve, store and
   input are parsed into structures allocated from odr_config; anything
   else is logged as a bad element. Several bare markup lines below are
   the remains of example comments whose delimiters are not visible in
   this listing. Return statements are on lines not visible here. */
438 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
440 char tmp_full_name[1024];
444 tinfo->fname = odr_strdup(tinfo->odr_config, fname);
/* use the resolved path when resolution succeeds, else the name as given */
446 if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path,
447 NULL, tmp_full_name))
448 tinfo->full_name = odr_strdup(tinfo->odr_config, tmp_full_name);
450 tinfo->full_name = odr_strdup(tinfo->odr_config, tinfo->fname);
452 yaz_log(YLOG_LOG, "dom filter: loading config file %s", tinfo->full_name);
454 doc = xmlParseFile(tinfo->full_name);
458 "%s: dom filter: failed to parse config file %s",
459 tinfo->fname, tinfo->full_name);
462 /* saved because we store pointers into its content */
463 tinfo->doc_config = doc;
465 ptr = xmlDocGetRootElement(doc);
466 if (!ptr || ptr->type != XML_ELEMENT_NODE
467 || XML_STRCMP(ptr->name, "dom"))
470 "%s: dom filter: expected root element <dom>",
475 for (ptr = ptr->children; ptr; ptr = ptr->next)
477 if (ptr->type != XML_ELEMENT_NODE)
479 if (!XML_STRCMP(ptr->name, "extract"))
482 <extract name="index">
483 <xslt stylesheet="first.xsl"/>
484 <xslt stylesheet="second.xsl"/>
487 struct _xmlAttr *attr;
488 struct filter_extract *f =
489 odr_malloc(tinfo->odr_config, sizeof(*f));
/* name is the only attribute accepted on extract */
494 for (attr = ptr->properties; attr; attr = attr->next)
496 if (attr_content(attr, "name", &f->name))
500 "%s: dom filter: bad attribute %s"
502 tinfo->fname, attr->name);
505 parse_convert(tinfo, ptr->children, &f->convert);
507 else if (!XML_STRCMP(ptr->name, "retrieve"))
511 <xslt stylesheet="some.xsl"/>
512 <xslt stylesheet="some.xsl"/>
515 struct _xmlAttr *attr;
/* fp starts at the list head; the linking of f is on lines not visible here */
516 struct filter_retrieve **fp = &tinfo->retrieve_list;
517 struct filter_retrieve *f =
518 odr_malloc(tinfo->odr_config, sizeof(*f));
/* identifier and name are the attributes accepted on retrieve */
529 for (attr = ptr->properties; attr; attr = attr->next)
531 if (attr_content(attr, "identifier",
534 else if (attr_content(attr, "name", &f->name))
538 "%s: dom filter: bad attribute %s"
540 tinfo->fname, attr->name);
542 parse_convert(tinfo, ptr->children, &f->convert);
544 else if (!XML_STRCMP(ptr->name, "store"))
548 <xslt stylesheet="some.xsl"/>
549 <xslt stylesheet="some.xsl"/>
552 struct filter_store *f =
553 odr_malloc(tinfo->odr_config, sizeof(*f));
557 parse_convert(tinfo, ptr->children, &f->convert);
559 else if (!XML_STRCMP(ptr->name, "input"))
563 <xmlreader level="1"/>
565 <input syntax="usmarc">
566 <marc inputcharset="marc-8"/>
569 struct _xmlAttr *attr;
570 const char *syntax = 0;
571 const char *name = 0;
/* syntax and name are the attributes accepted on input */
572 for (attr = ptr->properties; attr; attr = attr->next)
574 if (attr_content(attr, "syntax", &syntax))
576 else if (attr_content(attr, "name", &name))
580 "%s: dom filter: bad attribute %s"
582 tinfo->fname, attr->name);
584 parse_input(tinfo, ptr->children, syntax, name);
588 yaz_log(YLOG_WARN, "%s: dom filter: bad element %s",
589 tinfo->fname, ptr->name);
/* Finds the retrieve configuration for the requested element set name
   est by comparing it with each entry's identifier and name. The second
   parameter line and all return statements are not visible here. */
596 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
599 struct filter_retrieve *f = tinfo->retrieve_list;
601 /* return first schema if no est is provided */
604 for (; f; f = f->next)
606 /* find requested schema */
609 if (f->identifier && !strcmp(f->identifier, est))
611 if (f->name && !strcmp(f->name, est))
/* Configuration entry point: args names the configuration file. A
   warning is logged when it is missing; the file is compared with the
   one already loaded (tinfo->fname); the profile path is read from the
   "profilePath" resource and the file is handed to parse_dom(). The
   conditions and early returns between these steps are not visible. */
618 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
620 struct filter_info *tinfo = clientData;
623 yaz_log(YLOG_WARN, "dom filter: need config file");
/* same file name as the one already loaded */
627 if (tinfo->fname && !strcmp(args, tinfo->fname))
630 tinfo->profile_path = res_get(res, "profilePath");
633 return parse_dom(tinfo, args);
/* Tears down the filter state: both ODR handles are destroyed. Any other
   cleanup (for example a destroy_dom call or freeing tinfo) is on lines
   not visible here. */
636 static void filter_destroy(void *clientData)
638 struct filter_info *tinfo = clientData;
640 odr_destroy(tinfo->odr_config);
641 odr_destroy(tinfo->odr_record);
/* Read callback given to the libxml2 IO readers during extraction:
   context is the recExtractCtrl, and the request is forwarded to its
   stream readf function. Returns whatever readf returns. */
645 static int ioread_ex(void *context, char *buffer, int len)
647 struct recExtractCtrl *p = context;
648 return p->stream->readf(p->stream, buffer, len);
/* Close callback passed together with ioread_ex to xmlReaderForIO and
   xmlReadIO; its body is not visible in this listing. */
651 static int ioclose_ex(void *context)
658 /* Alvis style indexing: recursively visits ptr, its siblings and their
   descendants, and hands the content of text nodes to ctrl->tokenAdd
   through recWord. The statement that skips non-text nodes is on a line
   not visible here. */
659 static void index_cdata(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
660 xmlNodePtr ptr, RecWord *recWord)
662 for(; ptr; ptr = ptr->next)
/* descend first, so nested text is indexed before the text of this node */
664 index_cdata(tinfo, ctrl, ptr->children, recWord);
665 if (ptr->type != XML_TEXT_NODE)
/* term_buf points into the XML tree; nothing is copied here */
667 recWord->term_buf = (const char *)ptr->content;
668 recWord->term_len = XML_STRLEN(ptr->content);
669 (*ctrl->tokenAdd)(recWord);
673 /* Alvis style indexing: recursively searches the tree for index elements
   in the zebra XSLT namespace. For each one the name, xpath and type
   attributes are read and the text below the element is indexed under
   that name via index_cdata(). */
674 static void index_node(struct filter_info *tinfo, struct recExtractCtrl *ctrl,
675 xmlNodePtr ptr, RecWord *recWord)
677 for(; ptr; ptr = ptr->next)
679 index_node(tinfo, ctrl, ptr->children, recWord);
/* skip anything that is not an element in the zebra XSLT namespace */
680 if (ptr->type != XML_ELEMENT_NODE || !ptr->ns ||
681 XML_STRCMP(ptr->ns->href, zebra_xslt_ns))
683 if (!XML_STRCMP(ptr->name, "index"))
685 const char *name_str = 0;
686 const char *type_str = 0;
687 const char *xpath_str = 0;
688 struct _xmlAttr *attr;
689 for (attr = ptr->properties; attr; attr = attr->next)
691 if (attr_content(attr, "name", &name_str))
693 else if (attr_content(attr, "xpath", &xpath_str))
695 else if (attr_content(attr, "type", &type_str))
699 "%s: dom filter: bad attribute %s"
701 tinfo->fname, attr->name);
705 /* save default type */
706 int prev_type = recWord->index_type;
/* only the first character of the type attribute is used */
709 if (type_str && *type_str)
710 recWord->index_type = *type_str;
712 recWord->index_name = name_str;
713 index_cdata(tinfo, ctrl, ptr->children, recWord);
715 /* restore it again */
716 recWord->index_type = prev_type;
722 /* Alvis style indexing: handles a record element in the zebra XSLT
   namespace. Its type, id and rank attributes are read, id and rank are
   copied into ctrl, and the subtree is indexed when the type is "update"
   (the default). */
723 static void index_record(struct filter_info *tinfo,struct recExtractCtrl *ctrl,
724 xmlNodePtr ptr, RecWord *recWord)
/* default record type when no type attribute is present */
726 const char *type_str = "update";
728 if (ptr && ptr->type == XML_ELEMENT_NODE && ptr->ns &&
729 !XML_STRCMP(ptr->ns->href, zebra_xslt_ns)
730 && !XML_STRCMP(ptr->name, "record"))
732 const char *id_str = 0;
733 const char *rank_str = 0;
734 struct _xmlAttr *attr;
735 for (attr = ptr->properties; attr; attr = attr->next)
737 if (attr_content(attr, "type", &type_str))
739 else if (attr_content(attr, "id", &id_str))
741 else if (attr_content(attr, "rank", &rank_str))
744 yaz_log(YLOG_WARN, "%s: dom filter: bad attribute %s"
746 tinfo->fname, attr->name);
/* NOTE(review): %255s stops at the first whitespace and writes up to 256
   bytes; assumes match_criteria is at least that large -- confirm. */
749 sscanf(id_str, "%255s", ctrl->match_criteria);
752 ctrl->staticrank = atozint(rank_str);
756 if (!strcmp("update", type_str))
757 index_node(tinfo, ctrl, ptr, recWord);
758 else if (!strcmp("delete", type_str))
759 yaz_log(YLOG_WARN, "dom filter delete: to be implemented");
761 yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'",
766 /* Alvis style indexing: entry point for one document. The RecWord is
   initialised through extctr->init, the document is dumped to stdout
   when flagShowRecords is set, and the root element is indexed via
   index_record(). Within this listing the only reference to this
   function is a commented-out call in convert_extract_doc(). */
767 static void extract_doc_alvis(struct filter_info *tinfo,
768 struct recExtractCtrl *extctr,
777 (*extctr->init)(extctr, &recWord);
779 if (extctr->flagShowRecords){
/* NOTE(review): xmlDocDumpMemory allocates buf_out; the matching free is
   not visible in this listing -- confirm. */
780 xmlDocDumpMemory(doc, &buf_out, &len_out);
781 fwrite(buf_out, len_out, 1, stdout);
784 root_ptr = xmlDocGetRootElement(doc);
786 index_record(tinfo, extctr, root_ptr, &recWord);
788 yaz_log(YLOG_WARN, "No root for index XML record");
793 /* DOM filter style indexing: same test as attr_content(), but the result
   is delivered as an xmlChar pointer. *dst_content refers into the XML
   tree (no copy). Return statements are not visible here. */
794 static int attr_content_xml(struct _xmlAttr *attr, const char *name,
795 xmlChar **dst_content)
797 if (0 == XML_STRCMP(attr->name, name) && attr->children
798 && attr->children->type == XML_TEXT_NODE)
800 *dst_content = (attr->children->content);
807 /* DOM filter style indexing: indexes the text content of node under every
   index listed in index_p. index_p is a space separated list of entries
   of the form name or name:type; for each entry the full node text is
   handed to extctr->tokenAdd. The remaining parameters, the local
   buffers index and type, and the pointer bookkeeping (bval, eval) are
   on lines not visible here. */
808 static void index_value_of(struct filter_info *tinfo,
809 struct recExtractCtrl *extctr,
/* NOTE(review): xmlNodeGetContent can return NULL and its result must be
   released with xmlFree; neither a NULL check before strlen nor the free
   is visible in this listing -- confirm. */
813 xmlChar *text = xmlNodeGetContent(node);
814 size_t text_len = strlen((const char *)text);
817 /* if there is no text, we do not need to proceed */
820 xmlChar *look = index_p;
827 /* assigning text to be indexed */
829 (*extctr->init)(extctr, &recWord);
830 recWord.term_buf = (const char *)text;
831 recWord.term_len = text_len;
833 /* parsing all index name/type pairs */
834 /* may not start with ' ' or ':' */
835 while (*look && ' ' != *look && ':' != *look){
837 /* setting name and type to zero */
841 /* parsing one index name */
843 while (*look && ':' != *look && ' ' != *look){
/* NOTE(review): no check of eval - bval against the size of index is
   visible here -- confirm the buffer cannot overflow. */
847 strncpy((char *)index, (const char *)bval, eval - bval);
848 index[eval - bval] = '\0';
851 /* parsing one index type, if existing */
856 while (*look && ' ' != *look){
860 strncpy((char *)type, (const char *)bval, eval - bval);
861 type[eval - bval] = '\0';
864 /* actually indexing the text given */
865 /* printf("INDEX '%s:%s' '%s'\n", index, type, text); */
867 recWord.index_name = (const char *)index;
/* only the first character of the parsed type is used */
869 recWord.index_type = *type;
870 (extctr->tokenAdd)(&recWord);
872 /* eat whitespaces */
873 if (*look && ' ' == *look && *(look+1)){
883 /* DOM filter style indexing: copies the record id into
   extctr->match_criteria and the rank into extctr->staticrank. The record
   type is accepted but its handling below is commented out. Remaining
   parameter lines and the guards around each statement are not visible. */
884 static void set_record_info(struct filter_info *tinfo,
885 struct recExtractCtrl *extctr,
/* NOTE(review): callers can pass null pointers (process_xml_pi_node passes
   0 for the type); whether this debug print is guarded is not visible
   here -- confirm. */
890 printf("RECORD id=%s rank=%s type=%s\n", id_p, rank_p, type_p);
/* %255s stops at the first whitespace character */
893 sscanf((const char *)id_p, "%255s", extctr->match_criteria);
896 extctr->staticrank = atozint((const char *)rank_p);
898 /* if (!strcmp("update", type_str)) */
899 /* index_node(tinfo, ctrl, ptr, recWord); */
900 /* else if (!strcmp("delete", type_str)) */
901 /* yaz_log(YLOG_WARN, "dom filter delete: to be implemented"); */
903 /* yaz_log(YLOG_WARN, "dom filter: unknown record type '%s'", */
909 /* DOM filter style indexing: handles an element in the zebra DOM
   namespace. An index element has the content of node indexed under the
   names in its name attribute; a record element supplies id, rank and
   type to set_record_info(); any other element in that namespace is
   reported on stdout.
   NOTE(review): per the libxml2 documentation the string returned by
   xmlGetNodePath must be freed by the caller; the printf calls below use
   it directly, so each diagnostic presumably leaks it -- confirm. */
910 static void process_xml_element_zebra_node(struct filter_info *tinfo,
911 struct recExtractCtrl *extctr,
914 if (node->type == XML_ELEMENT_NODE
915 && node->ns && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns)){
917 if (0 == XML_STRCMP(node->name, "index")){
918 xmlChar *index_p = 0;
920 struct _xmlAttr *attr;
921 for (attr = node->properties; attr; attr = attr->next){
922 if (attr_content_xml(attr, "name", &index_p)){
923 index_value_of(tinfo, extctr, node, index_p);
926 // printf("%s: dom filter: s% bad attribute %s",
927 // tinfo->fname, xmlGetNodePath(node)), nodeattr->name);
928 printf("dom filter: %s bad attribute @%s, "
930 xmlGetNodePath(node), attr->name);
933 else if (0 == XML_STRCMP(node->name, "record")){
938 struct _xmlAttr *attr;
939 for (attr = node->properties; attr; attr = attr->next){
940 if (attr_content_xml(attr, "id", &id_p))
942 else if (attr_content_xml(attr, "rank", &rank_p))
944 else if (attr_content_xml(attr, "type", &type_p))
947 // printf("%s: dom filter: s% bad attribute %s",
948 // tinfo->fname, xmlGetNodePath(node)), nodeattr->name);
949 printf("dom filter: %s bad attribute @%s,"
950 " expected @id|@rank|@type\n",
951 xmlGetNodePath(node), attr->name);
/* any type other than "update" is only reported, not acted upon */
953 if (type_p && 0 != strcmp("update", (const char *)type_p))
954 printf("dom filter: %s attribute @%s,"
955 " only implemented '@type=\"update\"\n",
956 xmlGetNodePath(node), attr->name);
960 set_record_info(tinfo, extctr, id_p, rank_p, type_p);
962 // printf("%s: dom filter: s% bad attribute %s",
963 // tinfo->fname, xmlGetNodePath(node)), nodeattr->name);
964 printf("dom filter: %s bad element <%s>,"
965 " expected <record>|<index> in namespace '%s'\n",
966 xmlGetNodePath(node), node->name, zebra_dom_ns);
973 /* DOM filter style indexing: interprets a processing instruction whose
   name equals zebra_pi_name. Content starting with "record" may carry
   id= and rank= values, which go to set_record_info(); content starting
   with "index" carries index instructions for the caller (the caller
   passes the address of its index_p for this). Anything else is reported
   on stdout. The remaining parameters, the local id and rank buffers and
   the pointer bookkeeping are on lines not visible here. */
974 static void process_xml_pi_node(struct filter_info *tinfo,
975 struct recExtractCtrl *extctr,
980 /* printf("PI %s\n", xmlGetNodePath(node)); */
982 /* if right PI name, continue parsing PI */
983 if (0 == strcmp(zebra_pi_name, (const char *)node->name)){
984 xmlChar *pi_p = node->content;
985 xmlChar *look = pi_p;
990 /* parsing PI record instructions */
991 if (0 == strncmp((const char *)look, "record", 6)){
1002 /* eat whitespace */
1003 while (*look && ' ' == *look && *(look+1))
1006 /* parse possible id */
1007 if (*look && 0 == strncmp((const char *)look, "id=", 3)){
/* the value runs up to the next space or the end of the content */
1010 while (*look && ' ' != *look)
/* NOTE(review): no check of eval - bval against the size of id is
   visible here -- confirm the buffer cannot overflow. */
1013 strncpy((char *)id, (const char *)bval, eval - bval);
1014 id[eval - bval] = '\0';
1017 /* eat whitespace */
1018 while (*look && ' ' == *look && *(look+1))
1021 /* parse possible rank */
1022 if (*look && 0 == strncmp((const char *)look, "rank=", 5)){
1025 while (*look && ' ' != *look)
1028 strncpy((char *)rank, (const char *)bval, eval - bval);
1029 rank[eval - bval] = '\0';
1032 /* eat whitespace */
1033 while (*look && ' ' == *look && *(look+1))
/* anything left over at this point could not be parsed */
1036 if (look && '\0' != *look){
1037 printf ("ERROR %s: content '%s'; can not parse '%s'\n",
1038 xmlGetNodePath(node), pi_p, look);
1040 /* set_record_info(id, rank, type); */
1041 set_record_info(tinfo, extctr, id, rank, 0);
1046 /* parsing index instruction */
1047 else if (0 == strncmp((const char *)look, "index", 5)){
1050 /* eat whitespace */
1051 while (*look && ' ' == *look && *(look+1))
1054 /* export index instructions to outside */
1057 /* neither record nor index */
1060 printf ("ERROR %s: content '%s'; can not parse '%s'\n",
1061 xmlGetNodePath(node), pi_p, look);
1066 /* DOM filter style indexing: recursive tree walk. The node itself is
   offered to process_xml_element_zebra_node(); then each child is
   visited: processing instructions go to process_xml_pi_node(), and
   element children are indexed with a pending PI index instruction (if
   any) before the walk recurses into them. */
1067 static void process_xml_element_node(struct filter_info *tinfo,
1068 struct recExtractCtrl *extctr,
1071 /* remember indexing instruction from PI to next element node */
1072 xmlChar *index_p = 0;
1074 /* printf("ELEM %s\n", xmlGetNodePath(node)); */
1076 /* check if we are an element node in the special zebra namespace
1077 and either set record data or index value-of node content*/
1078 process_xml_element_zebra_node(tinfo, extctr, node);
1080 /* loop through kid nodes */
1081 for (node = node->children; node; node = node->next)
1083 /* check and set PI record and index instructions */
1084 if (node->type == XML_PI_NODE){
1085 process_xml_pi_node(tinfo, extctr, node, &index_p);
1087 else if (node->type == XML_ELEMENT_NODE){
1088 /* if there was a PI index instruction before this element */
1090 index_value_of(tinfo, extctr, node, index_p);
1093 process_xml_element_node(tinfo, extctr, node);
1101 /* DOM filter style indexing: entry point for one document. The document
   is dumped to stdout when flagShowRecords is set, then the tree walk
   starts at the document node itself (cast to xmlNodePtr). */
1102 static void extract_dom_doc_node(struct filter_info *tinfo,
1103 struct recExtractCtrl *extctr,
1106 /* printf("DOC %s\n", xmlGetNodePath((xmlNodePtr)doc)); */
1110 if (extctr->flagShowRecords){
/* NOTE(review): xmlDocDumpMemory allocates buf_out; the matching free is
   not visible in this listing -- confirm. */
1111 xmlDocDumpMemory(doc, &buf_out, &len_out);
1112 fwrite(buf_out, len_out, 1, stdout);
1116 process_xml_element_node(tinfo, extctr, (xmlNodePtr)doc);
/* Runs the three conversion stages on one record and indexes the result:
   1. the input chain of the selected input,
   2. the store chain, applied to a copy of the document, whose output is
      handed to p->setStoreData,
   3. the extract chain, followed by DOM style indexing.
   The final parameter (the document) and the conditions between the
   statements are on lines not visible here. */
1122 static int convert_extract_doc(struct filter_info *tinfo,
1123 struct filter_input *input,
1124 struct recExtractCtrl *p,
1130 const char *params[10];
1131 xsltStylesheetPtr last_xsp = 0;
1132 xmlDocPtr store_doc = 0;
/* the stylesheets receive the zebra DOM namespace as their schema parameter */
1135 set_param_str(params, "schema", zebra_dom_ns, tinfo->odr_record);
1137 /* input conversion */
1138 perform_convert(tinfo, input->convert, params, &doc, 0);
1142 /* store conversion */
/* work on a copy so the extract stage still sees the input-converted document */
1143 store_doc = xmlCopyDoc(doc, 1);
1144 perform_convert(tinfo, tinfo->store->convert,
1145 params, &store_doc, &last_xsp);
/* two serialisation paths follow; the condition choosing between them is
   not visible here */
1149 xsltSaveResultToString(&buf_out, &len_out,
1150 store_doc ? store_doc : doc, last_xsp);
1152 xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1153 if (p->flagShowRecords)
1154 fwrite(buf_out, len_out, 1, stdout);
1155 (*p->setStoreData)(p, buf_out, len_out);
1159 xmlFreeDoc(store_doc);
1161 /* extract conversion */
1162 perform_convert(tinfo, tinfo->extract->convert, params, &doc, 0);
1164 /* finally, do the indexing */
1166 extract_dom_doc_node(tinfo, p, doc);
1167 /* extract_doc_alvis(tinfo, p, doc); */
1171 return RECCTRL_EXTRACT_OK;
/* Streaming extraction: reads the input with an xmlTextReader and treats
   every element found at depth split_level as one record. On the first
   record of a stream a fresh reader is created; each call advances the
   reader until the next matching element, copies it into a new document
   and passes it to convert_extract_doc(). Returns RECCTRL_EXTRACT_EOF
   once the reader is exhausted. Loop headers and some conditions are on
   lines not visible here. */
1174 static int extract_xml_split(struct filter_info *tinfo,
1175 struct filter_input *input,
1176 struct recExtractCtrl *p)
1180 if (p->first_record)
/* discard a reader left over from a previous stream */
1182 if (input->u.xmlreader.reader)
1183 xmlFreeTextReader(input->u.xmlreader.reader);
1184 input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1185 p /* I/O handler */,
1191 if (!input->u.xmlreader.reader)
1192 return RECCTRL_EXTRACT_ERROR_GENERIC;
1194 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1197 int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1198 int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
1199 if (type == XML_READER_TYPE_ELEMENT &&
1200 input->u.xmlreader.split_level == depth)
1203 = xmlTextReaderExpand(input->u.xmlreader.reader);
/* deep copy, so the record document does not share nodes with the reader */
1206 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1207 xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1209 xmlDocSetRootElement(doc, ptr2);
1211 return convert_extract_doc(tinfo, input, p, doc);
/* failure path: drop the reader and report a generic error */
1215 xmlFreeTextReader(input->u.xmlreader.reader);
1216 input->u.xmlreader.reader = 0;
1217 return RECCTRL_EXTRACT_ERROR_GENERIC;
1220 ret = xmlTextReaderRead(input->u.xmlreader.reader);
/* no more nodes: release the reader and signal end of input */
1222 xmlFreeTextReader(input->u.xmlreader.reader);
1223 input->u.xmlreader.reader = 0;
1224 return RECCTRL_EXTRACT_EOF;
/* Whole-stream extraction: on the first record of a stream the complete
   input is parsed into one document with xmlReadIO and passed to
   convert_extract_doc(); any later call returns RECCTRL_EXTRACT_EOF.
   NOTE(review): XML_PARSE_NOENT makes the parser substitute entities;
   confirm that the indexed input is trusted. */
1227 static int extract_xml_full(struct filter_info *tinfo,
1228 struct filter_input *input,
1229 struct recExtractCtrl *p)
1231 if (p->first_record) /* only one record per stream */
1233 xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex,
1234 p /* I/O handler */,
1237 XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1240 return RECCTRL_EXTRACT_ERROR_GENERIC;
1242 return convert_extract_doc(tinfo, input, p, doc);
1245 return RECCTRL_EXTRACT_EOF;
/* Reads one ISO2709 (MARC) record from the stream, converts it to an XML
   tree with the YAZ MARC handle of this input and passes the resulting
   document to convert_extract_doc(). The first five bytes hold the
   record length; bytes are skipped until the first one is a digit. The
   declaration of buf and its size are on lines not visible here. */
1248 static int extract_iso2709(struct filter_info *tinfo,
1249 struct filter_input *input,
1250 struct recExtractCtrl *p)
1256 if (p->stream->readf(p->stream, buf, 5) != 5)
1257 return RECCTRL_EXTRACT_EOF;
/* resynchronise: skip bytes until the first one is a digit */
1258 while (*buf < '0' || *buf > '9')
1262 yaz_log(YLOG_WARN, "MARC: Skipping bad byte %d (0x%02X)",
1263 *buf & 0xff, *buf & 0xff);
1264 for (i = 0; i<4; i++)
1267 if (p->stream->readf(p->stream, buf+4, 1) != 1)
1268 return RECCTRL_EXTRACT_EOF;
1270 record_length = atoi_n (buf, 5);
1271 if (record_length < 25)
1273 yaz_log (YLOG_WARN, "MARC record length < 25, is %d",
1275 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* NOTE(review): assumes buf can hold record_length bytes; the buffer
   declaration is not visible here -- confirm. */
1277 read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1278 if (read_bytes < record_length-5)
1280 yaz_log (YLOG_WARN, "Couldn't read whole MARC record");
1281 return RECCTRL_EXTRACT_ERROR_GENERIC;
1283 r = yaz_marc_read_iso2709(input->u.marc.handle, buf, record_length);
1284 if (r < record_length)
1286 yaz_log (YLOG_WARN, "Parsing of MARC record failed r=%d length=%d",
1288 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* wrap the generated XML tree in a fresh document for the conversion stages */
1294 yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0);
1295 rdoc = xmlNewDoc((const xmlChar*) "1.0");
1296 xmlDocSetRootElement(rdoc, root_ptr);
1297 return convert_extract_doc(tinfo, input, p, rdoc);
1299 return RECCTRL_EXTRACT_OK;
/* Extraction entry point: takes the head of tinfo->input_list, resets the
   per-record ODR memory and dispatches on the input kind. XML reader
   inputs with split level 0 are read as one document, other levels are
   split per element; MARC inputs go to extract_iso2709(). The condition
   for the first error return and the switch header are not visible. */
1302 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1304 struct filter_info *tinfo = clientData;
1305 struct filter_input *input = tinfo->input_list;
1308 return RECCTRL_EXTRACT_ERROR_GENERIC;
1310 odr_reset(tinfo->odr_record);
1313 case DOM_INPUT_XMLREADER:
1314 if (input->u.xmlreader.split_level == 0)
1315 return extract_xml_full(tinfo, input, p);
1317 return extract_xml_split(tinfo, input, p);
1319 case DOM_INPUT_MARC:
1320 return extract_iso2709(tinfo, input, p);
1322 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* Read callback given to xmlReadIO during retrieval: context is the
   recRetrieveCtrl, and the request is forwarded to its stream readf
   function. Returns whatever readf returns. */
1325 static int ioread_ret(void *context, char *buffer, int len)
1327 struct recRetrieveCtrl *p = context;
1328 return p->stream->readf(p->stream, buffer, len);
/* Close callback passed together with ioread_ret to xmlReadIO; its body
   is not visible in this listing. */
1331 static int ioclose_ret(void *context)
/* Retrieval entry point. The element set name is taken from the request,
   the matching retrieve configuration is looked up, stylesheet
   parameters (id, filename, rank, schema, score, size) are prepared, the
   stored record is parsed with xmlReadIO and run through the retrieve
   conversion chain. The result is copied into p->rec_buf as XML or SUTRS;
   failures are reported through p->diagnostic. Conditions between many
   of the statements are on lines not visible here. */
1336 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
1338 /* const char *esn = zebra_dom_ns; */
1339 const char *esn = 0;
1340 const char *params[32];
1341 struct filter_info *tinfo = clientData;
1343 struct filter_retrieve *retrieve;
1344 xsltStylesheetPtr last_xsp = 0;
/* pick the element set name from a simple or a complex record composition */
1348 if (p->comp->which == Z_RecordComp_simple
1349 && p->comp->u.simple->which == Z_ElementSetNames_generic)
1351 esn = p->comp->u.simple->u.generic;
1353 else if (p->comp->which == Z_RecordComp_complex
1354 && p->comp->u.complex->generic->elementSpec
1355 && p->comp->u.complex->generic->elementSpec->which ==
1356 Z_ElementSpec_elementSetName)
1358 esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1361 retrieve = lookup_retrieve(tinfo, esn);
1365 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
/* parameter strings are allocated from the ODR of the request */
1370 set_param_int(params, "id", p->localno, p->odr);
1372 set_param_str(params, "filename", p->fname, p->odr);
1373 if (p->staticrank >= 0)
1374 set_param_int(params, "rank", p->staticrank, p->odr);
/* schema parameter candidates: requested name, configured name,
   configured identifier, or the empty string */
1377 set_param_str(params, "schema", esn, p->odr);
1380 set_param_str(params, "schema", retrieve->name, p->odr);
1381 else if (retrieve->identifier)
1382 set_param_str(params, "schema", retrieve->identifier, p->odr);
1384 set_param_str(params, "schema", "", p->odr);
1387 set_param_int(params, "score", p->score, p->odr);
1388 set_param_int(params, "size", p->recordSize, p->odr);
1390 doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1393 XML_PARSE_XINCLUDE|XML_PARSE_NOENT);
1396 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1400 /* retrieve conversion */
1401 perform_convert(tinfo, retrieve->convert, params, &doc, &last_xsp);
1404 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
/* NOTE(review): this branch tests input_format while the SUTRS branch
   below tests output_format -- confirm the asymmetry is intended. */
1406 else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML)
1412 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1414 xmlDocDumpMemory(doc, &buf_out, &len_out);
1416 p->output_format = VAL_TEXT_XML;
/* the serialised record is copied into memory owned by the request ODR */
1417 p->rec_len = len_out;
1418 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1419 memcpy(p->rec_buf, buf_out, p->rec_len);
1422 else if (p->output_format == VAL_SUTRS)
1428 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1430 xmlDocDumpMemory(doc, &buf_out, &len_out);
1432 p->output_format = VAL_SUTRS;
1433 p->rec_len = len_out;
1434 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1435 memcpy(p->rec_buf, buf_out, p->rec_len);
1441 p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
/* Record type descriptor of this filter; the initializer entries are on
   lines not visible in this listing. */
1447 static struct recType filter_type = {
1458 #ifdef IDZEBRA_STATIC_DOM
1471 * indent-tabs-mode: nil
1473 * vim: shiftwidth=4 tabstop=8 expandtab