1 /* This file is part of the Zebra server.
2 Copyright (C) 1995-2008 Index Data
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25 #include <yaz/diagbib1.h>
26 #include <yaz/tpath.h>
27 #include <yaz/snprintf.h>
29 #include <libxml/xmlversion.h>
30 #include <libxml/parser.h>
31 #include <libxml/tree.h>
32 #include <libxml/xmlIO.h>
33 #include <libxml/xmlreader.h>
34 #include <libxslt/transform.h>
35 #include <libxslt/xsltutils.h>
38 #include <libexslt/exslt.h>
41 #include <idzebra/util.h>
42 #include <idzebra/recctrl.h>
43 #include <yaz/oid_db.h>
45 /* XML namespace URI that marks Zebra instruction elements; element
   nodes are recognised by comparing node->ns->href against it */
46 #define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
47 static const char *zebra_dom_ns = ZEBRA_DOM_NS;
49 /* target name of the processing instructions that carry Zebra
   "record" and "index" instructions */
50 #define ZEBRA_PI_NAME "zebra-2.0"
51 static const char *zebra_pi_name = ZEBRA_PI_NAME;
/* stylesheet name as given in the @stylesheet attribute of <xslt> */
59 const char *stylesheet;
/* parsed form of that stylesheet; released with xsltFreeStylesheet */
60 xsltStylesheetPtr stylesheet_xsp;
/* kind of conversion step: convert_xslt_type or convert_meta_type */
68 enum convert_type which;
/* per-kind data of the step */
70 struct convert_xslt xslt;
71 struct convert_meta meta;
/* following step; perform_convert runs the steps in list order */
73 struct convert_s *next;
/* Configuration of an <extract> element: the conversion chain that is run
   on a record right before it is indexed. */
76 struct filter_extract {
78 struct convert_s *convert;
/* NOTE(review): presumably the conversion chain member of struct
   filter_store (used as tinfo->store->convert) -- confirm */
82 struct convert_s *convert;
/* One <retrieve> configuration; all entries form a singly linked list
   headed by filter_info.retrieve_list. */
85 struct filter_retrieve {
/* value of @identifier; lookup_retrieve compares it with the requested
   element set name */
87 const char *identifier;
/* conversion chain run when a record is presented through this entry */
88 struct convert_s *convert;
89 struct filter_retrieve *next;
/* kinds of record input understood by the filter */
92 #define DOM_INPUT_XMLREADER 1
93 #define DOM_INPUT_MARC 2
/* conversion chain applied to every record read from this input */
97 struct convert_s *convert;
/* libxml2 streaming reader; extract_xml_split creates it for the first
   record of a stream and frees it at end of input or on error */
101 xmlTextReaderPtr reader;
/* NOTE(review): parse_input keeps the charset in a local variable and hands
   it to yaz_iconv_open; confirm where this member is assigned */
105 const char *input_charset;
/* next configured input */
110 struct filter_input *next;
/* value of the "profilePath" resource; used to resolve the config file and
   the stylesheets */
116 const char *profile_path;
/* parsed configuration document; kept alive because attribute strings are
   referenced by pointer rather than copied */
119 xmlDocPtr doc_config;
/* configuration taken from <extract>, <retrieve>, <input> and <store> */
120 struct filter_extract *extract;
121 struct filter_retrieve *retrieve_list;
122 struct filter_input *input_list;
123 struct filter_store *store;
/* number of record instructions seen for the current record; a record with
   a count of 0 is reported as RECCTRL_EXTRACT_SKIP */
124 int record_info_invoked;
/* strcmp/strlen for xmlChar strings.
   NOTE(review): the macro arguments are not parenthesised in the
   expansion, so only simple expressions should be passed. */
129 #define XML_STRCMP(a,b) strcmp((char*)a, b)
130 #define XML_STRLEN(a) strlen((char*)a)
/* Walk the sibling list starting at ptr and execute the statement that
   follows for element nodes only.  The macro advances ptr itself, and it
   ends in a bare `if`, so an `else` placed directly after the macro's
   statement would bind to that hidden `if`. */
133 #define FOR_EACH_ELEMENT(ptr) for (; ptr; ptr = ptr->next) if (ptr->type == XML_ELEMENT_NODE)
/* Log a printf-style message through yaz_log, prefixed with the name of the
   filter configuration (tinfo->fname, or "none" when it is unset).
   level: yaz_log level; ptr: node the message refers to (callers pass 0
   when there is none); fmt and following arguments: printf-style message. */
135 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
136 const char *fmt, ...)
/* have the compiler check fmt (argument 4) against the variadic arguments */
138 __attribute__ ((format (printf, 4, 5)))
142 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
143 const char *fmt, ...)
/* format the message, leaving the last byte of buf untouched */
149 yaz_vsnprintf(buf, sizeof(buf)-1, fmt, ap);
/* form with the line number that libxml2 recorded for ptr */
152 yaz_log(level, "%s:%ld: %s", tinfo->fname ? tinfo->fname : "none",
153 xmlGetLineNo(ptr), buf);
/* form without a line number -- presumably taken when ptr is null; confirm */
157 yaz_log(level, "%s: %s", tinfo->fname ? tinfo->fname : "none", buf);
/* Prepare the string parameter `name` for the params array by wrapping
   value in single quotes; the quoted copy is allocated from nmem.
   NOTE(review): value is copied verbatim, so a single quote inside value is
   not escaped. */
163 static void set_param_str(const char **params, const char *name,
164 const char *value, NMEM nmem)
/* room for value, the two quote characters and the terminating NUL */
166 char *quoted = nmem_malloc(nmem, 3 + strlen(value));
167 sprintf(quoted, "'%s'", value);
/* Integer counterpart of set_param_str: formats value with ZINT_FORMAT
   between single quotes in a 30 byte buffer allocated from nmem. */
175 static void set_param_int(const char **params, const char *name,
176 zint value, NMEM nmem)
178 char *quoted = nmem_malloc(nmem, 30); /* 25 digits enough for 2^64 */
181 sprintf(quoted, "'" ZINT_FORMAT "'", value);
/* Allocate the per-filter state and put it into its "nothing configured"
   state: two fresh NMEM pools and cleared configuration pointers. */
187 static void *filter_init(Res res, RecType recType)
189 struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
191 tinfo->full_name = 0;
192 tinfo->profile_path = 0;
/* nmem_record is reset at the start of each filter_extract call,
   nmem_config when the configuration is torn down in destroy_dom */
193 tinfo->nmem_record = nmem_create();
194 tinfo->nmem_config = nmem_create();
196 tinfo->retrieve_list = 0;
197 tinfo->input_list = 0;
199 tinfo->doc_config = 0;
200 tinfo->record_info_invoked = 0;
/* Check whether attr is called `name` and has a text node as first child;
   if so, make *dst_content point at that text.  The text is not copied, so
   the pointer refers into the document that owns attr.  Callers use the
   return value as a "matched" flag. */
209 static int attr_content(struct _xmlAttr *attr, const char *name,
210 const char **dst_content)
212 if (!XML_STRCMP(attr->name, name) && attr->children
213 && attr->children->type == XML_TEXT_NODE)
215 *dst_content = (const char *)(attr->children->content);
/* Free the parsed stylesheet held by an XSLT conversion step; steps of any
   other kind own no stylesheet.
   NOTE(review): presumably applied to every step of the chain that starts
   at c -- confirm. */
221 static void destroy_xsp(struct convert_s *c)
225 if (c->which == convert_xslt_type)
227 if (c->u.xslt.stylesheet_xsp)
228 xsltFreeStylesheet(c->u.xslt.stylesheet_xsp);
/* Tear down everything that was built from the configuration file:
   parsed stylesheets, XML readers, MARC handles and the config document
   itself.  The list heads are cleared and nmem_config is reset, which is
   where the configuration structures were allocated. */
234 static void destroy_dom(struct filter_info *tinfo)
/* stylesheets of the extract and store chains */
238 destroy_xsp(tinfo->extract->convert);
243 destroy_xsp(tinfo->store->convert);
/* per-input resources, depending on the kind of input */
246 if (tinfo->input_list)
248 struct filter_input *i_ptr;
249 for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
253 case DOM_INPUT_XMLREADER:
254 if (i_ptr->u.xmlreader.reader)
255 xmlFreeTextReader(i_ptr->u.xmlreader.reader);
258 yaz_iconv_close(i_ptr->u.marc.iconv);
259 yaz_marc_destroy(i_ptr->u.marc.handle);
262 destroy_xsp(i_ptr->convert);
264 tinfo->input_list = 0;
/* stylesheets of every retrieve chain */
266 if (tinfo->retrieve_list)
268 struct filter_retrieve *r_ptr;
269 for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
270 destroy_xsp(r_ptr->convert);
271 tinfo->retrieve_list = 0;
/* the config document goes last: attribute strings point into it */
274 if (tinfo->doc_config)
276 xmlFreeDoc(tinfo->doc_config);
277 tinfo->doc_config = 0;
279 nmem_reset(tinfo->nmem_config);
/* Build a conversion chain from the element siblings starting at ptr.
   <xslt stylesheet="..."> yields an XSLT step whose stylesheet is resolved
   against the profile path and parsed right away; <process-meta> yields a
   meta step.  Steps are allocated from nmem_config.  Unknown attributes and
   elements are reported through dom_log.
   tinfo: filter state; ptr: first sibling to inspect; l: where the chain
   is attached. */
282 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
283 struct convert_s **l)
286 FOR_EACH_ELEMENT(ptr) {
287 if (!XML_STRCMP(ptr->name, "xslt"))
289 struct _xmlAttr *attr;
290 struct convert_s *p = nmem_malloc(tinfo->nmem_config, sizeof(*p));
293 p->which = convert_xslt_type;
294 p->u.xslt.stylesheet = 0;
295 p->u.xslt.stylesheet_xsp = 0;
/* @stylesheet is the only attribute accepted on <xslt> */
297 for (attr = ptr->properties; attr; attr = attr->next)
298 if (attr_content(attr, "stylesheet", &p->u.xslt.stylesheet))
302 dom_log(YLOG_WARN, tinfo, ptr,
303 "bad attribute @%s", attr->name);
/* locate the stylesheet file, then parse it */
305 if (p->u.xslt.stylesheet)
307 char tmp_xslt_full_name[1024];
308 if (!yaz_filepath_resolve(p->u.xslt.stylesheet,
313 dom_log(YLOG_WARN, tinfo, 0,
314 "stylesheet %s not found in "
316 p->u.xslt.stylesheet,
317 tinfo->profile_path);
321 p->u.xslt.stylesheet_xsp
322 = xsltParseStylesheetFile((const xmlChar*)
324 if (!p->u.xslt.stylesheet_xsp)
326 dom_log(YLOG_WARN, tinfo, 0,
327 "could not parse xslt stylesheet %s",
334 dom_log(YLOG_WARN, tinfo, ptr,
335 "missing attribute 'stylesheet'");
/* <process-meta> takes no attributes: each one found is reported */
341 else if (!XML_STRCMP(ptr->name, "process-meta"))
343 struct _xmlAttr *attr;
344 struct convert_s *p = nmem_malloc(tinfo->nmem_config, sizeof(*p));
347 p->which = convert_meta_type;
349 for (attr = ptr->properties; attr; attr = attr->next)
350 dom_log(YLOG_WARN, tinfo, ptr,
351 "bad attribute @%s", attr->name);
/* NOTE(review): the message names only <xslt> although <process-meta> is
   accepted as well */
357 dom_log(YLOG_WARN, tinfo, ptr,
358 "bad element '%s', expected <xslt>", ptr->name);
/* Expand <meta name="..."/> elements of the Zebra namespace in the subtree
   rooted at node.  For each such element the named element set is fetched
   through retctr->special_fetch, the result is parsed as XML and a copy of
   its root element replaces the <meta> node.  Child nodes are processed
   recursively. */
365 static int process_meta(struct filter_info *tinfo, xmlDocPtr doc, xmlNodePtr node,
366 struct recRetrieveCtrl *retctr)
/* only elements bound to the Zebra namespace are of interest */
369 if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href &&
370 0 == XML_STRCMP(node->ns->href, zebra_dom_ns))
372 if (0 == XML_STRCMP(node->name, "meta"))
374 const char *element_set_name = 0;
376 struct _xmlAttr *attr;
377 for (attr = node->properties; attr; attr = attr->next)
379 if (attr_content(attr, "name", &element_set_name))
383 dom_log(YLOG_WARN, tinfo, node,
384 "bad attribute @%s, expected @name", attr->name);
387 if (element_set_name)
/* result receives the fetched record; both buffers are destroyed below */
389 WRBUF result = wrbuf_alloc();
390 WRBUF addinfo = wrbuf_alloc();
/* the fetch is requested in XML record syntax */
391 const Odr_oid *input_format = yaz_oid_recsyn_xml;
392 const Odr_oid *output_format = 0;
395 ret = retctr->special_fetch(retctr->handle,
397 input_format, &output_format,
402 xmlParseMemory(wrbuf_buf(result), wrbuf_len(result));
/* a deep copy of the fetched root element takes the place of <meta> */
405 xmlNodePtr t = xmlDocGetRootElement(sub_doc);
406 xmlReplaceNode(node, xmlCopyNode(t, 1));
410 wrbuf_destroy(result);
411 wrbuf_destroy(addinfo);
/* descend into the children */
415 for (node = node->children; node; node = node->next)
416 process_meta(tinfo, doc, node, retctr);
/* Run a conversion chain on *doc.
   An XSLT step applies its stylesheet, serialises the result and parses it
   back, and stores the re-parsed document in *doc.  A meta step expands
   <meta> elements, but only when retctr is given, i.e. during retrieval.
   *last_xsp is set to the stylesheet of the XSLT step that was applied so
   that the caller can serialise with that stylesheet's output settings.
   extctr is only consulted for its flagShowRecords debug flag. */
420 static ZEBRA_RES perform_convert(struct filter_info *tinfo,
421 struct recExtractCtrl *extctr,
422 struct recRetrieveCtrl *retctr,
423 struct convert_s *convert,
426 xsltStylesheetPtr *last_xsp)
428 for (; convert; convert = convert->next)
430 if (convert->which == convert_xslt_type)
432 xmlChar *buf_out = 0;
434 xmlDocPtr res_doc = xsltApplyStylesheet(convert->u.xslt.stylesheet_xsp,
437 *last_xsp = convert->u.xslt.stylesheet_xsp;
442 /* now saving into buffer and re-reading into DOM to avoid annoying
443 XSLT problem with thrown-out indentation text nodes */
444 xsltSaveResultToString(&buf_out, &len_out, res_doc,
445 convert->u.xslt.stylesheet_xsp);
450 *doc = xmlParseMemory((const char *) buf_out, len_out);
452 /* writing debug info out */
453 if (extctr && extctr->flagShowRecords)
454 yaz_log(YLOG_LOG, "%s: XSLT %s\n %.*s",
455 tinfo->fname ? tinfo->fname : "(none)",
456 convert->u.xslt.stylesheet,
461 else if (convert->which == convert_meta_type)
463 if (retctr) /* only execute meta on retrieval */
465 process_meta(tinfo, *doc, xmlDocGetRootElement(*doc), retctr);
467 /* last stylesheet absent */
/* Allocate a filter_input from nmem_config and hook it onto the end of
   tinfo->input_list, so that inputs keep their configuration order.
   type is one of the DOM_INPUT_* values. */
476 static struct filter_input *new_input(struct filter_info *tinfo, int type)
478 struct filter_input *p;
/* np ends up addressing the terminating null link of the list */
479 struct filter_input **np = &tinfo->input_list;
480 for (;*np; np = &(*np)->next)
482 p = *np = nmem_malloc(tinfo->nmem_config, sizeof(*p));
/* Parse the children of an <input> element.  <marc inputcharset="...">
   creates a MARC input that converts records to UTF-8; <xmlreader
   level="..."> creates an XML reader input with the given split level.
   Both may carry a conversion chain, which is handed to parse_convert.
   syntax and name are the attributes of the enclosing <input>. */
491 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
492 const char *syntax, const char *name)
494 FOR_EACH_ELEMENT(ptr) {
495 if (!XML_STRCMP(ptr->name, "marc"))
497 yaz_iconv_t iconv = 0;
/* charset assumed when @inputcharset is not given */
498 const char *input_charset = "marc-8";
499 struct _xmlAttr *attr;
501 for (attr = ptr->properties; attr; attr = attr->next)
503 if (attr_content(attr, "inputcharset", &input_charset))
507 dom_log(YLOG_WARN, tinfo, ptr,
508 "bad attribute @%s, expected @inputcharset",
512 iconv = yaz_iconv_open("utf-8", input_charset);
/* NOTE(review): the message says @charset while the attribute is called
   @inputcharset */
515 dom_log(YLOG_WARN, tinfo, ptr,
516 "unsupported @charset '%s'", input_charset);
521 struct filter_input *p
522 = new_input(tinfo, DOM_INPUT_MARC);
523 p->u.marc.handle = yaz_marc_create();
524 p->u.marc.iconv = iconv;
/* let the MARC handle use the converter opened above */
526 yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
530 parse_convert(tinfo, ptr, &p->convert);
535 else if (!XML_STRCMP(ptr->name, "xmlreader"))
537 struct filter_input *p
538 = new_input(tinfo, DOM_INPUT_XMLREADER);
539 struct _xmlAttr *attr;
540 const char *level_str = 0;
/* split level 0 makes filter_extract read the whole stream as one record */
542 p->u.xmlreader.split_level = 0;
543 p->u.xmlreader.reader = 0;
545 for (attr = ptr->properties; attr; attr = attr->next)
547 if (attr_content(attr, "level", &level_str))
551 dom_log(YLOG_WARN, tinfo, ptr,
552 "bad attribute @%s, expected @level",
/* atoi: a non-numeric @level silently becomes 0 */
557 p->u.xmlreader.split_level = atoi(level_str);
561 parse_convert(tinfo, ptr, &p->convert);
566 dom_log(YLOG_WARN, tinfo, ptr,
567 "bad element <%s>, expected <marc>|<xmlreader>",
/* Load the filter configuration file fname.  The name is resolved against
   tinfo->profile_path (falling back to the name as given), the file is
   parsed, and the children of the <dom> root element are dispatched:
   <extract>, <retrieve>, <store> and <input>.  If no input was configured,
   a default XML reader input with split level 0 is added. */
575 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
577 char tmp_full_name[1024];
581 tinfo->fname = nmem_strdup(tinfo->nmem_config, fname);
583 if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path,
584 NULL, tmp_full_name))
585 tinfo->full_name = nmem_strdup(tinfo->nmem_config, tmp_full_name);
587 tinfo->full_name = nmem_strdup(tinfo->nmem_config, tinfo->fname);
589 yaz_log(YLOG_LOG, "%s dom filter: "
590 "loading config file %s", tinfo->fname, tinfo->full_name);
592 doc = xmlParseFile(tinfo->full_name);
595 yaz_log(YLOG_WARN, "%s: dom filter: "
596 "failed to parse config file %s",
597 tinfo->fname, tinfo->full_name);
600 /* save because we store ptrs to the content */
601 tinfo->doc_config = doc;
/* the root element has to be <dom> */
603 ptr = xmlDocGetRootElement(doc);
604 if (!ptr || ptr->type != XML_ELEMENT_NODE
605 || XML_STRCMP(ptr->name, "dom"))
607 dom_log(YLOG_WARN, tinfo, ptr,
608 "bad root element <%s>, expected root element <dom>",
614 FOR_EACH_ELEMENT(ptr) {
615 if (!XML_STRCMP(ptr->name, "extract"))
618 <extract name="index">
619 <xslt stylesheet="first.xsl"/>
620 <xslt stylesheet="second.xsl"/>
623 struct _xmlAttr *attr;
624 struct filter_extract *f =
625 nmem_malloc(tinfo->nmem_config, sizeof(*f));
630 for (attr = ptr->properties; attr; attr = attr->next)
632 if (attr_content(attr, "name", &f->name))
636 dom_log(YLOG_WARN, tinfo, ptr,
637 "bad attribute @%s, expected @name",
641 parse_convert(tinfo, ptr->children, &f->convert);
643 else if (!XML_STRCMP(ptr->name, "retrieve"))
647 <xslt stylesheet="some.xsl"/>
648 <xslt stylesheet="some.xsl"/>
651 struct _xmlAttr *attr;
652 struct filter_retrieve **fp = &tinfo->retrieve_list;
653 struct filter_retrieve *f =
654 nmem_malloc(tinfo->nmem_config, sizeof(*f));
665 for (attr = ptr->properties; attr; attr = attr->next)
667 if (attr_content(attr, "identifier",
670 else if (attr_content(attr, "name", &f->name))
674 dom_log(YLOG_WARN, tinfo, ptr,
675 "bad attribute @%s, expected @identifier|@name",
679 parse_convert(tinfo, ptr->children, &f->convert);
681 else if (!XML_STRCMP(ptr->name, "store"))
685 <xslt stylesheet="some.xsl"/>
686 <xslt stylesheet="some.xsl"/>
689 struct filter_store *f =
690 nmem_malloc(tinfo->nmem_config, sizeof(*f));
694 parse_convert(tinfo, ptr->children, &f->convert);
696 else if (!XML_STRCMP(ptr->name, "input"))
700 <xmlreader level="1"/>
702 <input syntax="usmarc">
703 <marc inputcharset="marc-8"/>
706 struct _xmlAttr *attr;
707 const char *syntax = 0;
708 const char *name = 0;
709 for (attr = ptr->properties; attr; attr = attr->next)
711 if (attr_content(attr, "syntax", &syntax))
713 else if (attr_content(attr, "name", &name))
717 dom_log(YLOG_WARN, tinfo, ptr,
718 "bad attribute @%s, expected @syntax|@name",
722 parse_input(tinfo, ptr->children, syntax, name);
726 dom_log(YLOG_WARN, tinfo, ptr,
728 "expected <extract>|<input>|<retrieve>|<store>",
/* no <input> configured: fall back to reading whole XML documents */
733 if (!tinfo->input_list)
735 struct filter_input *p
736 = new_input(tinfo, DOM_INPUT_XMLREADER);
737 p->u.xmlreader.split_level = 0;
738 p->u.xmlreader.reader = 0;
/* Find the <retrieve> configuration for the element set name est.  An
   entry matches when its identifier or its name equals est. */
743 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
746 struct filter_retrieve *f = tinfo->retrieve_list;
748 /* return first schema if no est is provided */
751 for (; f; f = f->next)
753 /* find requested schema */
756 if (f->identifier && !strcmp(f->identifier, est))
758 if (f->name && !strcmp(f->name, est))
/* Configure the filter from args, the name of its configuration file.
   A missing name is reported with a warning.  The "profilePath" resource
   is read from res before the file is parsed by parse_dom. */
765 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
767 struct filter_info *tinfo = clientData;
770 yaz_log(YLOG_WARN, "dom filter: need config file");
/* same file name as the one already recorded in tinfo->fname --
   presumably nothing has to be reloaded then; confirm */
774 if (tinfo->fname && !strcmp(args, tinfo->fname))
777 tinfo->profile_path = res_get(res, "profilePath");
780 return parse_dom(tinfo, args);
/* Release the two NMEM pools of the filter state created by filter_init. */
783 static void filter_destroy(void *clientData)
785 struct filter_info *tinfo = clientData;
787 nmem_destroy(tinfo->nmem_config);
788 nmem_destroy(tinfo->nmem_record);
/* libxml2 read callback for extraction: context is the recExtractCtrl, and
   up to len bytes are read from its stream into buffer.  The stream's own
   return value is passed back unchanged. */
792 static int ioread_ex(void *context, char *buffer, int len)
794 struct recExtractCtrl *p = context;
795 return p->stream->readf(p->stream, buffer, len);
/* libxml2 close callback that is registered together with ioread_ex. */
798 static int ioclose_ex(void *context)
805 /* Index the text content of node under every index listed in index_p.
   index_p is a space separated list of "name" or "name:type" entries,
   e.g. "title:w any:w title:p".  The text is handed to extctr->tokenAdd
   once per entry, each time starting from the same sequence number.
   Nothing is indexed unless exactly one record instruction has been seen
   (tinfo->record_info_invoked == 1). */
806 static void index_value_of(struct filter_info *tinfo,
807 struct recExtractCtrl *extctr,
812 if (tinfo->record_info_invoked == 1)
/* NOTE(review): xmlNodeGetContent returns an allocated string; confirm it
   is released with xmlFree on every path */
814 xmlChar *text = xmlNodeGetContent(node);
815 size_t text_len = strlen((const char *)text);
817 /* if there is no text, we do not need to proceed */
820 /* keep seqno base so that all text will have
821 identical seqno's for multiple fields , e.g
822 <z:index name="title:w any:w title:p">.. */
824 zint seqno_base = recword->seqno;
825 zint seqno_max = recword->seqno;
828 const char *look = index_p;
835 /* assigning text to be indexed */
836 recword->term_buf = (const char *)text;
837 recword->term_len = text_len;
839 /* parsing all index name/type pairs */
840 /* may not start with ' ' or ':' */
841 while (*look && ' ' != *look && ':' != *look)
843 /* setting name and type to zero */
847 /* parsing one index name */
849 while (*look && ':' != *look && ' ' != *look)
/* NOTE(review): the copy length is eval - bval; confirm that index is
   large enough for the longest name that can occur */
854 strncpy((char *)index, (const char *)bval, eval - bval);
855 index[eval - bval] = '\0';
858 /* parsing one index type, if existing */
864 while (*look && ' ' != *look)
/* NOTE(review): same question for the size of type */
869 strncpy((char *)type, (const char *)bval, eval - bval);
870 type[eval - bval] = '\0';
873 /* actually indexing the text given */
875 recword->seqno = seqno_base;
876 recword->index_name = (const char *)index;
878 recword->index_type = (const char *) type;
880 /* writing debug out */
881 if (extctr->flagShowRecords)
882 dom_log(YLOG_LOG, tinfo, 0,
883 "INDEX '%s:%s' '%s'",
884 (const char *) index,
886 (const char *) text);
888 (extctr->tokenAdd)(recword);
/* remember the highest sequence number reached by any entry */
890 if (seqno_max < recword->seqno)
891 seqno_max = recword->seqno;
893 /* eat whitespaces */
894 if (*look && ' ' == *look)
/* continue after the furthest position any of the indexes reached */
899 recword->seqno = seqno_max;
906 /* Apply a record instruction to the extract control block.
   id_p supplies the match criteria, rank_p the static rank and type_p the
   action ("insert", "delete", "replace" or "update").  Every call is
   counted in tinfo->record_info_invoked. */
907 static void set_record_info(struct filter_info *tinfo,
908 struct recExtractCtrl *extctr,
914 /* writing debug info out */
915 if (extctr && extctr->flagShowRecords)
916 dom_log(YLOG_LOG, tinfo, node,
917 "RECORD id=%s rank=%s type=%s",
918 id_p ? (const char *) id_p : "(null)",
919 rank_p ? (const char *) rank_p : "(null)",
920 type_p ? (const char *) type_p : "(null)");
/* %255s: takes the first whitespace-delimited token of the id, at most 255
   characters -- assumes match_criteria has room for 256 bytes; TODO confirm */
924 sscanf((const char *)id_p, "%255s", extctr->match_criteria);
926 if (rank_p && *rank_p)
927 extctr->staticrank = atozint((const char *)rank_p);
929 if (type_p && *type_p)
/* an unknown type is reported and leaves the default, action_update */
931 enum zebra_recctrl_action_t action = action_update;
932 if (!strcmp(type_p, "insert"))
933 action = action_insert;
934 else if (!strcmp(type_p, "delete"))
935 action = action_delete;
936 else if (!strcmp(type_p, "replace"))
937 action = action_replace;
938 else if (!strcmp(type_p, "update"))
939 action = action_update;
941 dom_log(YLOG_WARN, tinfo, node, "bad @type value: %s", type_p);
942 extctr->action = action;
/* a count of 1 at this point means this is the second instruction */
945 if (tinfo->record_info_invoked == 1)
947 /* warn about multiple only once */
948 dom_log(YLOG_WARN, tinfo, node, "multiple record elements");
950 tinfo->record_info_invoked++;
955 /* Handle an element of the Zebra namespace: <index name="..."> indexes
   the element's text content, <record id=".." rank=".." type=".."> sets the
   record information.  Elements outside that namespace are left alone;
   any other element inside it is reported as a warning. */
956 static void process_xml_element_zebra_node(struct filter_info *tinfo,
957 struct recExtractCtrl *extctr,
961 if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href
962 && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns))
964 if (0 == XML_STRCMP(node->name, "index"))
966 const char *index_p = 0;
968 struct _xmlAttr *attr;
969 for (attr = node->properties; attr; attr = attr->next)
/* the text is indexed as soon as @name is found */
971 if (attr_content(attr, "name", &index_p))
973 index_value_of(tinfo, extctr, recword, node, index_p);
977 dom_log(YLOG_WARN, tinfo, node,
978 "bad attribute @%s, expected @name",
983 else if (0 == XML_STRCMP(node->name, "record"))
985 const char *id_p = 0;
986 const char *rank_p = 0;
987 const char *type_p = 0;
989 struct _xmlAttr *attr;
990 for (attr = node->properties; attr; attr = attr->next)
992 if (attr_content(attr, "id", &id_p))
994 else if (attr_content(attr, "rank", &rank_p))
996 else if (attr_content(attr, "type", &type_p))
1000 dom_log(YLOG_WARN, tinfo, node,
1001 "bad attribute @%s, expected @id|@rank|@type",
/* attributes that were not present are passed on as null pointers */
1005 set_record_info(tinfo, extctr, node, id_p, rank_p, type_p);
1009 dom_log(YLOG_WARN, tinfo, node,
1011 " expected <record>|<index> in namespace '%s'",
1012 node->name, zebra_dom_ns);
/* Try to read one "name=value" item from the processing instruction text
   at *c_ptr.  Leading blanks are skipped; on a match the value, which runs
   up to the next blank, is stored in value, and characters beyond
   value_max-1 are not stored.  Blanks after the item are skipped too.
   Callers use the return value as a "matched" flag. */
1017 static int attr_content_pi(const char **c_ptr, const char *name,
1018 char *value, size_t value_max)
1020 size_t name_len = strlen(name);
1021 const char *look = *c_ptr;
1025 while (*look && ' ' == *look)
/* the remaining text must be longer than the name for "name=" to fit */
1027 if (strlen(look) > name_len)
1029 if (look[name_len] == '=' && !memcmp(look, name, name_len))
1033 while (*look && ' ' != *look)
1035 if (i < value_max-1)
1043 while (*look && ' ' == *look)
1049 /* Interpret a processing instruction whose target is zebra_pi_name.
   Content starting with "record" carries id=, rank= and type= items that
   are handed to set_record_info; content starting with "index" carries an
   index specification that is exported through index_pp.  Anything else
   is reported as a warning. */
1050 static void process_xml_pi_node(struct filter_info *tinfo,
1051 struct recExtractCtrl *extctr,
1053 const char **index_pp)
1055 /* if right PI name, continue parsing PI */
1056 if (0 == strcmp(zebra_pi_name, (const char *)node->name))
1058 xmlChar *pi_p = node->content;
1059 const char *look = (const char *) node->content;
1061 /* parsing PI record instructions */
1062 if (0 == strncmp((const char *)look, "record", 6))
/* each call consumes one item when it matches */
1073 if (attr_content_pi(&look, "id", id, sizeof(id)))
1075 else if (attr_content_pi(&look, "rank", rank, sizeof(rank)))
1077 else if (attr_content_pi(&look, "type", type, sizeof(type)))
1079 dom_log(YLOG_WARN, tinfo, node,
1080 "content '%s', can not parse '%s'",
1084 set_record_info(tinfo, extctr, node, id, rank, type);
1086 /* parsing index instruction */
1087 else if (0 == strncmp((const char *)look, "index", 5))
1091 /* eat whitespace */
1092 while (*look && ' ' == *look)
1095 /* export index instructions to outside */
1100 dom_log(YLOG_WARN, tinfo, node,
1101 "content '%s', can not parse '%s'",
1107 /* Walk the tree below node depth-first.  The node itself is checked for
   Zebra namespace instructions; among its children, processing
   instructions are interpreted and element nodes are visited
   recursively. */
1108 static void process_xml_element_node(struct filter_info *tinfo,
1109 struct recExtractCtrl *extctr,
1113 /* remember indexing instruction from PI to next element node */
1114 const char *index_p = 0;
1116 /* check if we are an element node in the special zebra namespace
1117 and either set record data or index value-of node content*/
1118 process_xml_element_zebra_node(tinfo, extctr, recword, node);
1120 /* loop through kid nodes */
1121 for (node = node->children; node; node = node->next)
1123 /* check and set PI record and index instructions */
1124 if (node->type == XML_PI_NODE)
1126 process_xml_pi_node(tinfo, extctr, node, &index_p);
1128 else if (node->type == XML_ELEMENT_NODE)
1130 /* if there was a PI index instruction before this element */
1133 index_value_of(tinfo, extctr, recword, node, index_p);
1136 process_xml_element_node(tinfo, extctr, recword,node);
1144 /* Index a whole document: initialise one RecWord through extctr->init
   and walk the tree starting at the document node itself. */
1145 static void extract_dom_doc_node(struct filter_info *tinfo,
1146 struct recExtractCtrl *extctr,
1149 /* only need to do the initialization once, reuse recword for all terms */
1151 (*extctr->init)(extctr, &recword);
/* the document node is cast to a node pointer so the walk can start there */
1153 process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
/* Take one parsed record through the extraction pipeline: input
   conversion, store conversion on a copy (whose serialisation is handed to
   p->setStoreData), extract conversion, and finally indexing.
   Returns RECCTRL_EXTRACT_SKIP for an empty document or when no record
   instruction was seen during indexing, RECCTRL_EXTRACT_OK otherwise. */
1157 static int convert_extract_doc(struct filter_info *tinfo,
1158 struct filter_input *input,
1159 struct recExtractCtrl *p,
1164 const char *params[10];
1165 xsltStylesheetPtr last_xsp = 0;
1167 /* per default do not ingest record */
1168 tinfo->record_info_invoked = 0;
1170 /* exit if empty document given */
1172 return RECCTRL_EXTRACT_SKIP;
1174 /* we actually have a document which needs to be processed further */
1176 set_param_str(params, "schema", zebra_dom_ns, tinfo->nmem_record);
1178 if (p && p->flagShowRecords)
1182 xmlDocDumpMemory(doc, &buf_out, &len_out);
/* NOTE(review): if this dump to "extract.xml" is compiled in, outf is used
   without checking the result of fopen */
1184 FILE *outf = fopen("extract.xml", "w");
1185 fwrite(buf_out, 1, len_out, outf);
1188 yaz_log(YLOG_LOG, "Extract Doc: %.*s", len_out, buf_out);
/* NOTE(review): p is tested for null above but dereferenced
   unconditionally from here on */
1191 if (p->setStoreData)
1193 xmlDocPtr store_doc = 0;
1195 /* input conversion */
1196 perform_convert(tinfo, p, 0, input->convert, params, &doc, 0);
1200 /* store conversion */
1201 store_doc = xmlCopyDoc(doc, 1);
1202 perform_convert(tinfo, p, 0, tinfo->store->convert,
1203 params, &store_doc, &last_xsp);
1206 /* saving either store doc or original doc in case no store doc exists */
1208 xsltSaveResultToString(&buf_out, &len_out,
1209 store_doc ? store_doc : doc, last_xsp);
1211 xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1213 if (p->setStoreData)
1214 (*p->setStoreData)(p, buf_out, len_out);
1217 xmlFreeDoc(store_doc);
1221 /* extract conversion */
1222 perform_convert(tinfo, p, 0, tinfo->extract->convert, params, &doc, 0);
1225 /* finally, do the indexing */
1227 extract_dom_doc_node(tinfo, p, doc);
1231 /* there was nothing to index, so there is no inserted/updated record */
1232 if (tinfo->record_info_invoked == 0)
1233 return RECCTRL_EXTRACT_SKIP;
1235 return RECCTRL_EXTRACT_OK;
/* Read the next record from an XML stream that is split at a fixed depth.
   A text reader is created on the first record of a stream and then kept
   in input->u.xmlreader.reader.  Each element found at depth split_level
   is expanded, copied into a document of its own and passed to
   convert_extract_doc.  The reader is freed when the input is exhausted
   (RECCTRL_EXTRACT_EOF) or on error (RECCTRL_EXTRACT_ERROR_GENERIC). */
1238 static int extract_xml_split(struct filter_info *tinfo,
1239 struct filter_input *input,
1240 struct recExtractCtrl *p)
/* new stream: drop a reader left over from a previous one */
1244 if (p->first_record)
1246 if (input->u.xmlreader.reader)
1247 xmlFreeTextReader(input->u.xmlreader.reader);
1248 input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1249 p /* I/O handler */,
1256 if (!input->u.xmlreader.reader)
1257 return RECCTRL_EXTRACT_ERROR_GENERIC;
1259 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1262 int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1263 int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
/* a record starts at each element that sits exactly at the split level */
1265 if (type == XML_READER_TYPE_ELEMENT &&
1266 input->u.xmlreader.split_level == depth)
1270 /* per default do not ingest record */
1271 tinfo->record_info_invoked = 0;
1273 ptr = xmlTextReaderExpand(input->u.xmlreader.reader);
1276 /* we have a new document */
/* deep copy: the copy becomes the root of the new document */
1278 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1279 xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1281 xmlDocSetRootElement(doc, ptr2);
1283 /* writing debug info out */
1284 if (p->flagShowRecords)
1286 xmlChar *buf_out = 0;
1288 xmlDocDumpMemory(doc, &buf_out, &len_out);
1289 yaz_log(YLOG_LOG, "%s: XMLREADER level: %i\n%.*s",
1290 tinfo->fname ? tinfo->fname : "(none)",
1291 depth, len_out, buf_out);
1295 return convert_extract_doc(tinfo, input, p, doc);
/* error path: give up on this stream */
1299 xmlFreeTextReader(input->u.xmlreader.reader);
1300 input->u.xmlreader.reader = 0;
1301 return RECCTRL_EXTRACT_ERROR_GENERIC;
1304 ret = xmlTextReaderRead(input->u.xmlreader.reader);
/* nothing more to read */
1306 xmlFreeTextReader(input->u.xmlreader.reader);
1307 input->u.xmlreader.reader = 0;
1308 return RECCTRL_EXTRACT_EOF;
/* Treat the whole input stream as a single XML record: on the first call
   for a stream the document is parsed with xmlReadIO and passed to
   convert_extract_doc; later calls report RECCTRL_EXTRACT_EOF. */
1311 static int extract_xml_full(struct filter_info *tinfo,
1312 struct filter_input *input,
1313 struct recExtractCtrl *p)
1315 if (p->first_record) /* only one record per stream */
1317 xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex,
1318 p /* I/O handler */,
1326 return RECCTRL_EXTRACT_ERROR_GENERIC;
1328 return convert_extract_doc(tinfo, input, p, doc);
1331 return RECCTRL_EXTRACT_EOF;
/* Read one ISO2709 (MARC) record from the stream, convert it to MARCXML in
   the "http://www.loc.gov/MARC21/slim" namespace and hand the resulting
   document to convert_extract_doc.  Bytes that cannot start a record
   length are skipped with a warning.  Returns RECCTRL_EXTRACT_EOF when the
   stream ends and RECCTRL_EXTRACT_ERROR_GENERIC for a bad length, a
   truncated record or a record that fails to parse. */
1334 static int extract_iso2709(struct filter_info *tinfo,
1335 struct filter_input *input,
1336 struct recExtractCtrl *p)
/* the first five bytes of a record hold its length in decimal digits */
1342 if (p->stream->readf(p->stream, buf, 5) != 5)
1343 return RECCTRL_EXTRACT_EOF;
/* resynchronise: drop the leading byte until it is a digit */
1344 while (*buf < '0' || *buf > '9')
1348 dom_log(YLOG_WARN, tinfo, 0,
1349 "MARC: Skipping bad byte %d (0x%02X)",
1350 *buf & 0xff, *buf & 0xff);
1351 for (i = 0; i<4; i++)
1354 if (p->stream->readf(p->stream, buf+4, 1) != 1)
1355 return RECCTRL_EXTRACT_EOF;
1357 record_length = atoi_n (buf, 5);
1358 if (record_length < 25)
1360 dom_log(YLOG_WARN, tinfo, 0,
1361 "MARC record length < 25, is %d", record_length);
1362 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* NOTE(review): a five digit length can be as large as 99999; confirm that
   buf has room for a record of that size */
1364 read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1365 if (read_bytes < record_length-5)
1367 dom_log(YLOG_WARN, tinfo, 0,
1368 "couldn't read whole MARC record");
1369 return RECCTRL_EXTRACT_ERROR_GENERIC;
1371 r = yaz_marc_read_iso2709(input->u.marc.handle, buf, record_length);
1372 if (r < record_length)
1374 dom_log (YLOG_WARN, tinfo, 0,
1375 "parsing of MARC record failed r=%d length=%d",
1377 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* wrap the MARCXML tree in a document of its own */
1383 yaz_marc_write_xml(input->u.marc.handle, &root_ptr,
1384 "http://www.loc.gov/MARC21/slim", 0, 0);
1385 rdoc = xmlNewDoc((const xmlChar*) "1.0");
1386 xmlDocSetRootElement(rdoc, root_ptr);
1387 return convert_extract_doc(tinfo, input, p, rdoc);
1389 return RECCTRL_EXTRACT_OK;
/* Extract entry point of the filter: pick the reading strategy from the
   first configured input.  nmem_record is reset on every call. */
1392 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1394 struct filter_info *tinfo = clientData;
1395 struct filter_input *input = tinfo->input_list;
1398 return RECCTRL_EXTRACT_ERROR_GENERIC;
1400 nmem_reset(tinfo->nmem_record);
/* without a setStoreData handler the stream is always read as one whole
   XML document, whatever kind of input is configured */
1402 if (p->setStoreData == 0)
1403 return extract_xml_full(tinfo, input, p);
1406 case DOM_INPUT_XMLREADER:
/* split level 0 means "do not split" */
1407 if (input->u.xmlreader.split_level == 0)
1408 return extract_xml_full(tinfo, input, p);
1410 return extract_xml_split(tinfo, input, p);
1412 case DOM_INPUT_MARC:
1413 return extract_iso2709(tinfo, input, p);
1415 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* libxml2 read callback for retrieval: context is the recRetrieveCtrl, and
   up to len bytes are read from its stream into buffer.  The stream's own
   return value is passed back unchanged. */
1418 static int ioread_ret(void *context, char *buffer, int len)
1420 struct recRetrieveCtrl *p = context;
1421 return p->stream->readf(p->stream, buffer, len);
/* libxml2 close callback that is registered together with ioread_ret. */
1424 static int ioclose_ret(void *context)
/* Retrieve entry point of the filter.  The element set name is taken from
   the simple or complex record composition, the matching <retrieve>
   configuration is looked up, the stored record is parsed from the stream
   and run through that configuration's conversion chain, and the result
   is returned as XML or SUTRS in a buffer allocated from p->odr.
   Failures are reported through p->diagnostic. */
1429 static int filter_retrieve(void *clientData, struct recRetrieveCtrl *p)
1431 /* const char *esn = zebra_dom_ns; */
1432 const char *esn = 0;
1433 const char *params[32];
1434 struct filter_info *tinfo = clientData;
1436 struct filter_retrieve *retrieve;
1437 xsltStylesheetPtr last_xsp = 0;
/* pick the element set name from whichever composition form was sent */
1441 if (p->comp->which == Z_RecordComp_simple
1442 && p->comp->u.simple->which == Z_ElementSetNames_generic)
1444 esn = p->comp->u.simple->u.generic;
1446 else if (p->comp->which == Z_RecordComp_complex
1447 && p->comp->u.complex->generic->elementSpec
1448 && p->comp->u.complex->generic->elementSpec->which ==
1449 Z_ElementSpec_elementSetName)
1451 esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1454 retrieve = lookup_retrieve(tinfo, esn);
1458 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1459 p->addinfo = odr_strdup(p->odr, esn);
/* parameters handed to the conversion chain; the quoted values are
   allocated from the ODR memory of the request */
1464 set_param_int(params, "id", p->localno, p->odr->mem);
1466 set_param_str(params, "filename", p->fname, p->odr->mem);
1467 if (p->staticrank >= 0)
1468 set_param_int(params, "rank", p->staticrank, p->odr->mem);
1471 set_param_str(params, "schema", esn, p->odr->mem);
1474 set_param_str(params, "schema", retrieve->name, p->odr->mem);
1475 else if (retrieve->identifier)
1476 set_param_str(params, "schema", retrieve->identifier, p->odr->mem);
1478 set_param_str(params, "schema", "", p->odr->mem);
1481 set_param_int(params, "score", p->score, p->odr->mem);
1482 set_param_int(params, "size", p->recordSize, p->odr->mem);
/* parse the stored record: XInclude and entity substitution enabled,
   network access disabled */
1484 doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1487 XML_PARSE_XINCLUDE | XML_PARSE_NOENT | XML_PARSE_NONET);
1490 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1494 /* retrieve conversion */
1495 perform_convert(tinfo, 0, p, retrieve->convert, params, &doc, &last_xsp);
1498 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
/* XML is produced when no record syntax was requested or XML was asked for */
1500 else if (!p->input_format
1501 || !oid_oidcmp(p->input_format, yaz_oid_recsyn_xml))
1507 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1509 xmlDocDumpMemory(doc, &buf_out, &len_out);
/* the serialised record is copied into ODR memory for the caller */
1511 p->output_format = yaz_oid_recsyn_xml;
1512 p->rec_len = len_out;
1513 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1514 memcpy(p->rec_buf, buf_out, p->rec_len);
/* NOTE(review): this branch tests p->output_format whereas the XML branch
   above tests p->input_format -- confirm which one is intended */
1517 else if (!oid_oidcmp(p->output_format, yaz_oid_recsyn_sutrs))
1523 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1525 xmlDocDumpMemory(doc, &buf_out, &len_out);
1527 p->output_format = yaz_oid_recsyn_sutrs;
1528 p->rec_len = len_out;
1529 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1530 memcpy(p->rec_buf, buf_out, p->rec_len);
/* any other record syntax is refused */
1536 p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
/* Record type descriptor of this filter.
   NOTE(review): presumably it lists the filter_* entry points defined in
   this file -- confirm against the initialiser. */
1542 static struct recType filter_type = {
/* selects how the filter is registered when IDZEBRA_STATIC_DOM is defined */
1553 #ifdef IDZEBRA_STATIC_DOM
1566 * indent-tabs-mode: nil
1568 * vim: shiftwidth=4 tabstop=8 expandtab