1 /* $Id: mod_dom.c,v 1.37 2007-05-19 19:44:14 adam Exp $
2 Copyright (C) 1995-2007
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 #include <yaz/diagbib1.h>
29 #include <yaz/tpath.h>
30 #include <yaz/snprintf.h>
32 #include <libxml/xmlversion.h>
33 #include <libxml/parser.h>
34 #include <libxml/tree.h>
35 #include <libxml/xmlIO.h>
36 #include <libxml/xmlreader.h>
37 #include <libxslt/transform.h>
38 #include <libxslt/xsltutils.h>
41 #include <libexslt/exslt.h>
44 #include <idzebra/util.h>
45 #include <idzebra/recctrl.h>
46 #include <yaz/oid_db.h>
48 /* DOM filter style indexing */
49 #define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
50 static const char *zebra_dom_ns = ZEBRA_DOM_NS;
52 /* DOM filter style indexing */
53 #define ZEBRA_PI_NAME "zebra-2.0"
54 static const char *zebra_pi_name = ZEBRA_PI_NAME;
59 const char *stylesheet;
60 xsltStylesheetPtr stylesheet_xsp;
61 struct convert_s *next;
/* Settings taken from an <extract> element of the filter configuration
 * (filled in by parse_dom).  Only the convert member is visible in this
 * excerpt; parse_dom also writes a `name` member of this struct. */
64 struct filter_extract {
/* chain of XSLT conversions built by parse_convert */
66 struct convert_s *convert;
70 struct convert_s *convert;
/* One <retrieve> element of the filter configuration; entries form a
 * singly linked list headed by filter_info.retrieve_list.
 * lookup_retrieve matches a requested element set name against
 * `identifier` and against a `name` member that is not visible in this
 * excerpt. */
73 struct filter_retrieve {
/* value of the @identifier attribute, if given */
75 const char *identifier;
/* chain of XSLT conversions applied when this entry is selected */
76 struct convert_s *convert;
/* next <retrieve> entry, or 0 at the end of the list */
77 struct filter_retrieve *next;
80 #define DOM_INPUT_XMLREADER 1
81 #define DOM_INPUT_MARC 2
85 struct convert_s *convert;
89 xmlTextReaderPtr reader;
93 const char *input_charset;
98 struct filter_input *next;
104 const char *profile_path;
107 xmlDocPtr doc_config;
108 struct filter_extract *extract;
109 struct filter_retrieve *retrieve_list;
110 struct filter_input *input_list;
111 struct filter_store *store;
112 int record_info_invoked;
/* Compare a libxml2 string (cast to char*) with a plain C string using
 * strcmp.  The macro arguments are not parenthesised, so pass simple
 * expressions only. */
117 #define XML_STRCMP(a,b) strcmp((char*)a, b)
/* strlen on a libxml2 string (cast to char*). */
118 #define XML_STRLEN(a) strlen((char*)a)
/* Iterate over ptr and its following siblings, running the statement
 * that follows only for nodes of type XML_ELEMENT_NODE.
 * Caveats visible from the expansion: the loop advances `ptr` itself
 * (the caller's variable is left at 0 after a full pass), and the
 * expansion ends in a bare `if`, so an `else` written directly after
 * the body would bind to that hidden `if`. */
121 #define FOR_EACH_ELEMENT(ptr) for (; ptr; ptr = ptr->next) if (ptr->type == XML_ELEMENT_NODE)
/* dom_log: printf-style logging helper for this filter.
 *
 * The prototype carries a printf format attribute (format string is
 * argument 4, variable arguments start at 5).  The definition formats
 * the message into a local buffer with yaz_vsnprintf and passes it to
 * yaz_log at the requested level.
 *
 * Two output shapes are visible:
 *   "<fname>:<line>: <message>"  - line number from xmlGetLineNo(ptr)
 *   "<fname>: <message>"
 * where <fname> is tinfo->fname, or "none" when that is unset.
 *
 * NOTE(review): the test that selects between the two shapes is not
 * visible in this excerpt - presumably `ptr` being non-null; confirm.
 */
123 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
124 const char *fmt, ...)
126 __attribute__ ((format (printf, 4, 5)))
130 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
131 const char *fmt, ...)
137 yaz_vsnprintf(buf, sizeof(buf)-1, fmt, ap);
140 yaz_log(level, "%s:%ld: %s", tinfo->fname ? tinfo->fname : "none",
141 xmlGetLineNo(ptr), buf);
145 yaz_log(level, "%s: %s", tinfo->fname ? tinfo->fname : "none", buf);
/* set_param_str: build a single-quoted copy of `value` in memory taken
 * from `odr`.
 *
 * The allocation is strlen(value) + 3 bytes: two quote characters plus
 * the terminating NUL.  `value` is copied verbatim, so a single quote
 * inside it is not escaped, and `value` must not be NULL (strlen is
 * called on it).
 *
 * NOTE(review): the statements that store `name` and the quoted string
 * into `params` are not visible in this excerpt.
 */
151 static void set_param_str(const char **params, const char *name,
152 const char *value, ODR odr)
154 char *quoted = odr_malloc(odr, 3 + strlen(value));
155 sprintf(quoted, "'%s'", value);
/* set_param_int: integer counterpart of set_param_str.
 *
 * Formats `value` with ZINT_FORMAT between single quotes into a fixed
 * 30-byte buffer taken from the ODR handle.
 *
 * NOTE(review): the remaining parameters and the statements that store
 * the result into `params` are not visible in this excerpt.
 */
163 static void set_param_int(const char **params, const char *name,
166 char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */
169 sprintf(quoted, "'" ZINT_FORMAT "'", value);
/* filter_init: allocate and initialise the per-filter state.
 *
 * Allocates a struct filter_info with xmalloc, clears the fields shown
 * below and creates two ODR memory handles:
 *   odr_record - reset at the start of every filter_extract call
 *   odr_config - holds configuration data, reset by destroy_dom
 *
 * NOTE(review): initialisation of the remaining fields and the return
 * statement are not visible in this excerpt; `res` and `recType` are
 * not used by any visible line.
 */
175 static void *filter_init(Res res, RecType recType)
177 struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
179 tinfo->full_name = 0;
180 tinfo->profile_path = 0;
181 tinfo->odr_record = odr_createmem(ODR_ENCODE);
182 tinfo->odr_config = odr_createmem(ODR_ENCODE);
184 tinfo->retrieve_list = 0;
185 tinfo->input_list = 0;
187 tinfo->doc_config = 0;
188 tinfo->record_info_invoked = 0;
/* attr_content: fetch the text of an attribute with a given name.
 *
 * When attr->name equals `name` and the attribute's first child is a
 * text node, *dst_content is pointed at that child's content.  The
 * pointer refers into the parsed document; nothing is copied here.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * callers use the result as a boolean "matched" flag.
 */
197 static int attr_content(struct _xmlAttr *attr, const char *name,
198 const char **dst_content)
200 if (!XML_STRCMP(attr->name, name) && attr->children
201 && attr->children->type == XML_TEXT_NODE)
203 *dst_content = (const char *)(attr->children->content);
/* destroy_xsp: release the compiled stylesheet held by a convert_s
 * entry via xsltFreeStylesheet, if one is set.
 *
 * NOTE(review): the loop over the convert_s list is not visible in this
 * excerpt - presumably every entry reachable through ->next is handled;
 * confirm.
 */
209 static void destroy_xsp(struct convert_s *c)
213 if (c->stylesheet_xsp)
214 xsltFreeStylesheet(c->stylesheet_xsp);
/* destroy_dom: release everything that was built from the filter
 * configuration so that a new configuration can be loaded.
 *
 * Visible steps:
 *   - free the stylesheets of the extract and store conversion chains
 *   - for each input: free its xmlTextReader (XMLREADER inputs) or
 *     close its iconv handle and destroy its MARC handle, then free the
 *     input's stylesheets; clear input_list
 *   - free the stylesheets of every retrieve entry; clear retrieve_list
 *   - free the parsed configuration document
 *   - odr_reset(odr_config); the list nodes were obtained with
 *     odr_malloc on that handle (see new_input / parse_dom), so they
 *     are presumably reclaimed here rather than freed one by one
 *
 * NOTE(review): the guards around the extract/store calls and the
 * switch/case scaffolding are not visible in this excerpt.
 */
219 static void destroy_dom(struct filter_info *tinfo)
223 destroy_xsp(tinfo->extract->convert);
228 destroy_xsp(tinfo->store->convert);
231 if (tinfo->input_list)
233 struct filter_input *i_ptr;
234 for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
238 case DOM_INPUT_XMLREADER:
239 if (i_ptr->u.xmlreader.reader)
240 xmlFreeTextReader(i_ptr->u.xmlreader.reader);
243 yaz_iconv_close(i_ptr->u.marc.iconv);
244 yaz_marc_destroy(i_ptr->u.marc.handle);
247 destroy_xsp(i_ptr->convert);
249 tinfo->input_list = 0;
251 if (tinfo->retrieve_list)
253 struct filter_retrieve *r_ptr;
254 for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
255 destroy_xsp(r_ptr->convert);
256 tinfo->retrieve_list = 0;
259 if (tinfo->doc_config)
261 xmlFreeDoc(tinfo->doc_config);
262 tinfo->doc_config = 0;
264 odr_reset(tinfo->odr_config);
/* parse_convert: turn a run of <xslt stylesheet="..."/> elements into
 * a list of convert_s entries.
 *
 * tinfo - filter state; entries are allocated from tinfo->odr_config
 * ptr   - first sibling node to inspect (FOR_EACH_ELEMENT walks on)
 * l     - where the resulting list is linked in (linking code is not
 *         visible in this excerpt)
 *
 * For every <xslt> element the @stylesheet attribute is read, the file
 * is located with yaz_filepath_resolve (profile_path appears in the
 * "not found" warning) and compiled with xsltParseStylesheetFile.
 * Warnings are logged for unknown attributes, unresolved or unparsable
 * stylesheets, a missing @stylesheet attribute, and for any element
 * other than <xslt>.
 *
 * NOTE(review): the return statements are not visible here; every
 * visible caller ignores the returned ZEBRA_RES.
 */
267 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
268 struct convert_s **l)
271 FOR_EACH_ELEMENT(ptr) {
272 if (!XML_STRCMP(ptr->name, "xslt"))
274 struct _xmlAttr *attr;
276 = odr_malloc(tinfo->odr_config, sizeof(*p));
280 p->stylesheet_xsp = 0;
282 for (attr = ptr->properties; attr; attr = attr->next)
283 if (attr_content(attr, "stylesheet", &p->stylesheet))
287 dom_log(YLOG_WARN, tinfo, ptr,
288 "bad attribute @%s", attr->name);
292 char tmp_xslt_full_name[1024];
293 if (!yaz_filepath_resolve(p->stylesheet,
298 dom_log(YLOG_WARN, tinfo, 0,
299 "stylesheet %s not found in "
302 tinfo->profile_path);
307 = xsltParseStylesheetFile((const xmlChar*)
309 if (!p->stylesheet_xsp)
311 dom_log(YLOG_WARN, tinfo, 0,
312 "could not parse xslt stylesheet %s",
319 dom_log(YLOG_WARN, tinfo, ptr,
320 "missing attribute 'stylesheet' ");
328 dom_log(YLOG_WARN, tinfo, ptr,
329 "bad element '%s', expected <xslt>", ptr->name);
/* perform_convert: run a chain of XSLT conversions over *doc.
 *
 * For each entry of `convert`, in list order, the stylesheet is applied
 * with xsltApplyStylesheet, the result is serialised with
 * xsltSaveResultToString and parsed again with xmlParseMemory; *doc is
 * replaced by the re-parsed document.  The stylesheet used is reported
 * through *last_xsp.  When extctr is given and flagShowRecords is set,
 * the intermediate output is written to the log.
 *
 * NOTE(review): some callers pass 0 for last_xsp, so the store through
 * it is presumably guarded by a test that is not visible in this
 * excerpt.  Handling of a failed transformation, freeing of the old
 * document / buf_out, and the return value are also not visible.
 */
336 static ZEBRA_RES perform_convert(struct filter_info *tinfo,
337 struct recExtractCtrl *extctr,
338 struct convert_s *convert,
341 xsltStylesheetPtr *last_xsp)
343 for (; convert; convert = convert->next)
345 xmlChar *buf_out = 0;
347 xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
350 *last_xsp = convert->stylesheet_xsp;
355 /* now saving into buffer and re-reading into DOM to avoid annoying
356 XSLT problem with thrown-out indentation text nodes */
357 xsltSaveResultToString(&buf_out, &len_out, res_doc,
358 convert->stylesheet_xsp);
363 *doc = xmlParseMemory((const char *) buf_out, len_out);
365 /* writing debug info out */
366 if (extctr && extctr->flagShowRecords)
367 yaz_log(YLOG_LOG, "%s: XSLT %s\n %.*s",
368 tinfo->fname ? tinfo->fname : "(none)",
/* new_input: append a fresh filter_input to tinfo->input_list.
 *
 * Walks `np` to the terminating link of the list and stores a node
 * allocated from tinfo->odr_config there, so inputs keep the order in
 * which they were added.
 *
 * NOTE(review): initialisation of the new node (including use of
 * `type`) and the return statement are not visible in this excerpt.
 */
377 static struct filter_input *new_input(struct filter_info *tinfo, int type)
379 struct filter_input *p;
380 struct filter_input **np = &tinfo->input_list;
381 for (;*np; np = &(*np)->next)
383 p = *np = odr_malloc(tinfo->odr_config, sizeof(*p));
/* parse_input: handle the children of an <input> configuration element.
 *
 * <marc inputcharset="...">
 *     Opens an iconv conversion from the input charset (default
 *     "marc-8") to utf-8, creates a DOM_INPUT_MARC input with a new
 *     MARC handle bound to that iconv handle, then parses conversions
 *     with parse_convert.  An unsupported charset is logged.
 * <xmlreader level="N">
 *     Creates a DOM_INPUT_XMLREADER input with split_level 0 and no
 *     reader, sets split_level from @level via atoi, then parses
 *     conversions with parse_convert.
 * Any other element is logged as a warning, as are unexpected
 * attributes.
 *
 * NOTE(review): parse_convert is handed `ptr` here, whereas parse_dom
 * passes `ptr->children`; lines between the visible ones are missing,
 * so confirm which node is really passed.  The guard around the atoi
 * call (level_str may be 0), any use of `syntax` / `name`, and the
 * return statements are not visible in this excerpt.
 */
392 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
393 const char *syntax, const char *name)
395 FOR_EACH_ELEMENT(ptr) {
396 if (!XML_STRCMP(ptr->name, "marc"))
398 yaz_iconv_t iconv = 0;
399 const char *input_charset = "marc-8";
400 struct _xmlAttr *attr;
402 for (attr = ptr->properties; attr; attr = attr->next)
404 if (attr_content(attr, "inputcharset", &input_charset))
408 dom_log(YLOG_WARN, tinfo, ptr,
409 "bad attribute @%s, expected @inputcharset",
413 iconv = yaz_iconv_open("utf-8", input_charset);
416 dom_log(YLOG_WARN, tinfo, ptr,
417 "unsupported @charset '%s'", input_charset);
422 struct filter_input *p
423 = new_input(tinfo, DOM_INPUT_MARC);
424 p->u.marc.handle = yaz_marc_create();
425 p->u.marc.iconv = iconv;
427 yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
431 parse_convert(tinfo, ptr, &p->convert);
436 else if (!XML_STRCMP(ptr->name, "xmlreader"))
438 struct filter_input *p
439 = new_input(tinfo, DOM_INPUT_XMLREADER);
440 struct _xmlAttr *attr;
441 const char *level_str = 0;
443 p->u.xmlreader.split_level = 0;
444 p->u.xmlreader.reader = 0;
446 for (attr = ptr->properties; attr; attr = attr->next)
448 if (attr_content(attr, "level", &level_str))
452 dom_log(YLOG_WARN, tinfo, ptr,
453 "bad attribute @%s, expected @level",
458 p->u.xmlreader.split_level = atoi(level_str);
462 parse_convert(tinfo, ptr, &p->convert);
467 dom_log(YLOG_WARN, tinfo, ptr,
468 "bad element <%s>, expected <marc>|<xmlreader>",
/* parse_dom: load and interpret the DOM filter configuration file.
 *
 * 1. Copies `fname` into odr_config memory and resolves it against
 *    tinfo->profile_path; the resolved path, or fname itself when
 *    resolution fails, becomes tinfo->full_name.
 * 2. Parses the file with xmlParseFile.  The document is kept in
 *    tinfo->doc_config because attribute strings are referenced by
 *    pointer rather than copied.
 * 3. Requires the root element to be <dom>.
 * 4. Walks elements with FOR_EACH_ELEMENT and dispatches on name:
 *      <extract>  - reads @name, parses the <xslt> children
 *      <retrieve> - reads @identifier / @name, parses the <xslt>
 *                   children
 *      <store>    - parses the <xslt> children
 *      <input>    - reads @syntax / @name, hands the children to
 *                   parse_input
 *    Unknown elements and attributes are logged as warnings.
 * 5. When no <input> was configured, a default DOM_INPUT_XMLREADER
 *    input with split_level 0 is added.
 *
 * NOTE(review): on a bad root element dom_log is called with `ptr`,
 * which may be 0 when the document has no root; the format arguments
 * of that call are not visible - confirm they do not dereference it.
 * How the new entries are linked into tinfo (extract, store,
 * retrieve_list) and the return statements are also not visible in
 * this excerpt.
 */
476 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
478 char tmp_full_name[1024];
482 tinfo->fname = odr_strdup(tinfo->odr_config, fname);
484 if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path,
485 NULL, tmp_full_name))
486 tinfo->full_name = odr_strdup(tinfo->odr_config, tmp_full_name);
488 tinfo->full_name = odr_strdup(tinfo->odr_config, tinfo->fname);
490 yaz_log(YLOG_LOG, "%s dom filter: "
491 "loading config file %s", tinfo->fname, tinfo->full_name);
493 doc = xmlParseFile(tinfo->full_name);
496 yaz_log(YLOG_WARN, "%s: dom filter: "
497 "failed to parse config file %s",
498 tinfo->fname, tinfo->full_name);
501 /* save because we store ptrs to the content */
502 tinfo->doc_config = doc;
504 ptr = xmlDocGetRootElement(doc);
505 if (!ptr || ptr->type != XML_ELEMENT_NODE
506 || XML_STRCMP(ptr->name, "dom"))
508 dom_log(YLOG_WARN, tinfo, ptr,
509 "bad root element <%s>, expected root element <dom>",
515 FOR_EACH_ELEMENT(ptr) {
516 if (!XML_STRCMP(ptr->name, "extract"))
519 <extract name="index">
520 <xslt stylesheet="first.xsl"/>
521 <xslt stylesheet="second.xsl"/>
524 struct _xmlAttr *attr;
525 struct filter_extract *f =
526 odr_malloc(tinfo->odr_config, sizeof(*f));
531 for (attr = ptr->properties; attr; attr = attr->next)
533 if (attr_content(attr, "name", &f->name))
537 dom_log(YLOG_WARN, tinfo, ptr,
538 "bad attribute @%s, expected @name",
542 parse_convert(tinfo, ptr->children, &f->convert);
544 else if (!XML_STRCMP(ptr->name, "retrieve"))
548 <xslt stylesheet="some.xsl"/>
549 <xslt stylesheet="some.xsl"/>
552 struct _xmlAttr *attr;
553 struct filter_retrieve **fp = &tinfo->retrieve_list;
554 struct filter_retrieve *f =
555 odr_malloc(tinfo->odr_config, sizeof(*f));
566 for (attr = ptr->properties; attr; attr = attr->next)
568 if (attr_content(attr, "identifier",
571 else if (attr_content(attr, "name", &f->name))
575 dom_log(YLOG_WARN, tinfo, ptr,
576 "bad attribute @%s, expected @identifier|@name",
580 parse_convert(tinfo, ptr->children, &f->convert);
582 else if (!XML_STRCMP(ptr->name, "store"))
586 <xslt stylesheet="some.xsl"/>
587 <xslt stylesheet="some.xsl"/>
590 struct filter_store *f =
591 odr_malloc(tinfo->odr_config, sizeof(*f));
595 parse_convert(tinfo, ptr->children, &f->convert);
597 else if (!XML_STRCMP(ptr->name, "input"))
601 <xmlreader level="1"/>
603 <input syntax="usmarc">
604 <marc inputcharset="marc-8"/>
607 struct _xmlAttr *attr;
608 const char *syntax = 0;
609 const char *name = 0;
610 for (attr = ptr->properties; attr; attr = attr->next)
612 if (attr_content(attr, "syntax", &syntax))
614 else if (attr_content(attr, "name", &name))
618 dom_log(YLOG_WARN, tinfo, ptr,
619 "bad attribute @%s, expected @syntax|@name",
623 parse_input(tinfo, ptr->children, syntax, name);
627 dom_log(YLOG_WARN, tinfo, ptr,
629 "expected <extract>|<input>|<retrieve>|<store>",
634 if (!tinfo->input_list)
636 struct filter_input *p
637 = new_input(tinfo, DOM_INPUT_XMLREADER);
638 p->u.xmlreader.split_level = 0;
639 p->u.xmlreader.reader = 0;
/* lookup_retrieve: find the <retrieve> configuration entry for a
 * requested element set name `est`.
 *
 * Starts at tinfo->retrieve_list.  Per the comments below, the first
 * entry is used when no `est` is given; otherwise the list is searched
 * for an entry whose identifier or name equals `est` (string compare,
 * null members are skipped).
 *
 * NOTE(review): the `est` parameter declaration, the test for a
 * missing `est` and all return statements are not visible in this
 * excerpt; filter_retrieve treats the result as possibly null.
 */
644 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
647 struct filter_retrieve *f = tinfo->retrieve_list;
649 /* return first schema if no est is provided */
652 for (; f; f = f->next)
654 /* find requested schema */
657 if (f->identifier && !strcmp(f->identifier, est))
659 if (f->name && !strcmp(f->name, est))
/* filter_config: configuration entry point of the filter.
 *
 * clientData - the struct filter_info created by filter_init
 * res        - resource handle; "profilePath" is read from it
 * args       - name of the DOM configuration file
 *
 * Logs "need config file" when no usable `args` is supplied (the test
 * itself is not visible in this excerpt).  The comparison of `args`
 * with the currently loaded tinfo->fname presumably short-circuits a
 * reload of an unchanged configuration - the branch body is not
 * visible; confirm.  Otherwise profile_path is refreshed and the
 * result of parse_dom is returned.
 */
666 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
668 struct filter_info *tinfo = clientData;
671 yaz_log(YLOG_WARN, "dom filter: need config file");
675 if (tinfo->fname && !strcmp(args, tinfo->fname))
678 tinfo->profile_path = res_get(res, "profilePath");
681 return parse_dom(tinfo, args);
/* filter_destroy: tear down the filter state created by filter_init.
 *
 * Destroys both ODR memory handles.
 *
 * NOTE(review): a call to destroy_dom and the release of `tinfo`
 * itself are not visible in this excerpt - confirm they happen before
 * odr_config is destroyed.
 */
684 static void filter_destroy(void *clientData)
686 struct filter_info *tinfo = clientData;
688 odr_destroy(tinfo->odr_config);
689 odr_destroy(tinfo->odr_record);
/* ioread_ex: libxml2 read callback used during extraction.
 *
 * `context` is the struct recExtractCtrl handed to xmlReaderForIO /
 * xmlReadIO; the request for up to `len` bytes is forwarded to the
 * record stream's readf function and its result is returned unchanged.
 */
693 static int ioread_ex(void *context, char *buffer, int len)
695 struct recExtractCtrl *p = context;
696 return p->stream->readf(p->stream, buffer, len);
/* ioclose_ex: libxml2 close callback paired with ioread_ex in the
 * xmlReaderForIO / xmlReadIO calls of the extract path.
 * NOTE(review): the body is not visible in this excerpt. */
699 static int ioclose_ex(void *context)
705 /* DOM filter style indexing */
/* attr_content_xml: same check as attr_content, used while walking
 * record documents: when attr->name equals `name` and the attribute's
 * first child is a text node, *dst_content is pointed at that child's
 * content (no copy is made).
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * callers use the result as a boolean "matched" flag.
 */
706 static int attr_content_xml(struct _xmlAttr *attr, const char *name,
707 const char **dst_content)
709 if (0 == XML_STRCMP(attr->name, name) && attr->children
710 && attr->children->type == XML_TEXT_NODE)
712 *dst_content = (const char *) (attr->children->content);
719 /* DOM filter style indexing */
/* index_value_of: index the text content of `node` under every index
 * named in `index_p`.
 *
 * Work is done only when tinfo->record_info_invoked == 1.  The node
 * text (xmlNodeGetContent) becomes recword->term_buf / term_len.
 * `index_p` is scanned as a blank-separated list of "name" or
 * "name:type" items; for each item the name is copied into `index`,
 * the optional type into `type`, recword->index_name is set to the
 * name and recword->index_type to the first character of the type,
 * and extctr->tokenAdd is called.  With flagShowRecords set, each
 * item is also logged.
 *
 * NOTE(review):
 *  - recword->index_name / index_type are assigned both before and
 *    after the debug log in the visible lines; the first pair looks
 *    redundant.
 *  - the declarations and sizes of `index` / `type`, the pointers
 *    `bval` / `eval`, and the release of `text` (xmlNodeGetContent
 *    returns allocated memory) are not visible in this excerpt -
 *    confirm the copies are bounded and the text is freed.
 *  - strlen is applied to `text` before any visible null check.
 */
720 static void index_value_of(struct filter_info *tinfo,
721 struct recExtractCtrl *extctr,
726 if (tinfo->record_info_invoked == 1)
728 xmlChar *text = xmlNodeGetContent(node);
729 size_t text_len = strlen((const char *)text);
731 /* if there is no text, we do not need to proceed */
734 const char *look = index_p;
741 /* assigning text to be indexed */
742 recword->term_buf = (const char *)text;
743 recword->term_len = text_len;
745 /* parsing all index name/type pairs */
746 /* may not start with ' ' or ':' */
747 while (*look && ' ' != *look && ':' != *look)
749 /* setting name and type to zero */
753 /* parsing one index name */
755 while (*look && ':' != *look && ' ' != *look)
760 strncpy((char *)index, (const char *)bval, eval - bval);
761 index[eval - bval] = '\0';
764 /* parsing one index type, if existing */
770 while (*look && ' ' != *look)
775 strncpy((char *)type, (const char *)bval, eval - bval);
776 type[eval - bval] = '\0';
779 /* actually indexing the text given */
781 recword->index_name = (const char *)index;
783 recword->index_type = *type;
785 /* writing debug out */
786 if (extctr->flagShowRecords)
787 dom_log(YLOG_LOG, tinfo, 0,
788 "INDEX '%s:%s' '%s'",
789 index ? (const char *) index : "null",
790 type ? (const char *) type : "null",
791 text ? (const char *) text : "null");
793 /* actually indexing the text given */
794 recword->index_name = (const char *)index;
796 recword->index_type = *type;
797 (extctr->tokenAdd)(recword);
799 /* eat whitespaces */
800 if (*look && ' ' == *look)
811 /* DOM filter style indexing */
/* set_record_info: apply the id / rank / type values of a record
 * instruction to the extract control block.
 *
 *  - id_p:   first whitespace-delimited token, at most 255 characters,
 *            is scanned into extctr->match_criteria
 *  - rank_p: when non-empty, converted with atozint into
 *            extctr->staticrank
 *  - type_p: when non-empty, mapped to extctr->action: "insert",
 *            "delete", "replace" or "update"; the local default is
 *            action_update and any other value is logged as a bad
 *            @type value
 *
 * tinfo->record_info_invoked counts calls for the current record; a
 * "multiple record elements" warning is logged when it is already 1.
 * With flagShowRecords set, the three values are logged first.
 *
 * NOTE(review): the guard around the sscanf on id_p and the parameter
 * list after `extctr` are not visible in this excerpt.  extctr is
 * null-checked for the debug log but dereferenced without a visible
 * check further down.
 */
812 static void set_record_info(struct filter_info *tinfo,
813 struct recExtractCtrl *extctr,
819 /* writing debug info out */
820 if (extctr && extctr->flagShowRecords)
821 dom_log(YLOG_LOG, tinfo, node,
822 "RECORD id=%s rank=%s type=%s",
823 id_p ? (const char *) id_p : "(null)",
824 rank_p ? (const char *) rank_p : "(null)",
825 type_p ? (const char *) type_p : "(null)");
829 sscanf((const char *)id_p, "%255s", extctr->match_criteria);
831 if (rank_p && *rank_p)
832 extctr->staticrank = atozint((const char *)rank_p);
834 if (type_p && *type_p)
836 enum zebra_recctrl_action_t action = action_update;
837 if (!strcmp(type_p, "insert"))
838 action = action_insert;
839 else if (!strcmp(type_p, "delete"))
840 action = action_delete;
841 else if (!strcmp(type_p, "replace"))
842 action = action_replace;
843 else if (!strcmp(type_p, "update"))
844 action = action_update;
846 dom_log(YLOG_WARN, tinfo, node, "bad @type value: %s", type_p);
847 extctr->action = action;
848 yaz_log(YLOG_LOG, "In mod_dom.c: setting action to %d", action);
851 if (tinfo->record_info_invoked == 1)
853 /* warn about multiple only once */
854 dom_log(YLOG_WARN, tinfo, node, "multiple record elements");
856 tinfo->record_info_invoked++;
861 /* DOM filter style indexing */
/* process_xml_element_zebra_node: act on an element that lives in the
 * Zebra DOM namespace (zebra_dom_ns); other nodes are ignored.
 *
 *  <index name="...">
 *      the node's content is indexed through index_value_of using the
 *      @name value; other attributes are logged as bad
 *  <record id="..." rank="..." type="...">
 *      the attribute values are collected and passed to
 *      set_record_info; other attributes are logged as bad
 *  anything else in the namespace is logged as unexpected
 *
 * NOTE(review): the `recword` and `node` parameter declarations are
 * not visible in this excerpt.
 */
862 static void process_xml_element_zebra_node(struct filter_info *tinfo,
863 struct recExtractCtrl *extctr,
867 if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href
868 && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns))
870 if (0 == XML_STRCMP(node->name, "index"))
872 const char *index_p = 0;
874 struct _xmlAttr *attr;
875 for (attr = node->properties; attr; attr = attr->next)
877 if (attr_content_xml(attr, "name", &index_p))
879 index_value_of(tinfo, extctr, recword, node, index_p);
883 dom_log(YLOG_WARN, tinfo, node,
884 "bad attribute @%s, expected @name",
889 else if (0 == XML_STRCMP(node->name, "record"))
891 const char *id_p = 0;
892 const char *rank_p = 0;
893 const char *type_p = 0;
895 struct _xmlAttr *attr;
896 for (attr = node->properties; attr; attr = attr->next)
898 if (attr_content_xml(attr, "id", &id_p))
900 else if (attr_content_xml(attr, "rank", &rank_p))
902 else if (attr_content_xml(attr, "type", &type_p))
906 dom_log(YLOG_WARN, tinfo, node,
907 "bad attribute @%s, expected @id|@rank|@type",
911 set_record_info(tinfo, extctr, node, id_p, rank_p, type_p);
915 dom_log(YLOG_WARN, tinfo, node,
917 " expected <record>|<index> in namespace '%s'",
918 node->name, zebra_dom_ns);
/* attr_content_pi: try to read one `name=value` pair from the text of
 * a processing instruction.
 *
 * c_ptr     - in/out cursor into the PI text (the update of *c_ptr is
 *             not visible in this excerpt)
 * name      - attribute name to look for
 * value     - output buffer for the value
 * value_max - size of `value`
 *
 * Leading blanks are skipped; a match requires the text at the cursor
 * to start with `name` immediately followed by '='.  The value then
 * extends up to the next blank, and trailing blanks are skipped.
 *
 * NOTE(review): the copy into `value`, its bounding by value_max and
 * the return statements are not visible in this excerpt; callers use
 * the result as a boolean "matched" flag.
 */
923 static int attr_content_pi(const char **c_ptr, const char *name,
924 char *value, size_t value_max)
926 size_t name_len = strlen(name);
927 const char *look = *c_ptr;
931 while (*look && ' ' == *look)
933 if (strlen(look) > name_len)
935 if (look[name_len] == '=' && !memcmp(look, name, name_len))
939 while (*look && ' ' != *look)
949 while (*look && ' ' == *look)
955 /* DOM filter style indexing */
/* process_xml_pi_node: interpret a processing instruction whose target
 * equals zebra_pi_name; other PIs are ignored.
 *
 *  content starting with "record":
 *      id=, rank= and type= pairs are read with attr_content_pi into
 *      local buffers and passed to set_record_info; text that cannot
 *      be parsed is logged
 *  content starting with "index":
 *      blanks are skipped and the index instruction is exported to
 *      the caller through *index_pp (assignment not visible here)
 *  anything else is logged as unparsable
 *
 * NOTE(review): the prefix tests use strncmp with lengths 6 and 5, so
 * longer words beginning with "record" / "index" also match.  The
 * declarations and sizes of id / rank / type are not visible in this
 * excerpt.
 */
956 static void process_xml_pi_node(struct filter_info *tinfo,
957 struct recExtractCtrl *extctr,
959 const char **index_pp)
961 /* if right PI name, continue parsing PI */
962 if (0 == strcmp(zebra_pi_name, (const char *)node->name))
964 xmlChar *pi_p = node->content;
965 const char *look = (const char *) node->content;
967 /* parsing PI record instructions */
968 if (0 == strncmp((const char *)look, "record", 6))
979 if (attr_content_pi(&look, "id", id, sizeof(id)))
981 else if (attr_content_pi(&look, "rank", rank, sizeof(rank)))
983 else if (attr_content_pi(&look, "type", type, sizeof(type)))
985 dom_log(YLOG_WARN, tinfo, node,
986 "content '%s', can not parse '%s'",
990 set_record_info(tinfo, extctr, node, id, rank, type);
992 /* parsing index instruction */
993 else if (0 == strncmp((const char *)look, "index", 5))
998 while (*look && ' ' == *look)
1001 /* export index instructions to outside */
1006 dom_log(YLOG_WARN, tinfo, node,
1007 "content '%s', can not parse '%s'",
1013 /* DOM filter style indexing */
/* process_xml_element_node: depth-first walk over a record document.
 *
 * The node itself is first offered to process_xml_element_zebra_node.
 * Its children are then visited in document order:
 *   - a PI child may set `index_p` (via process_xml_pi_node)
 *   - an element child is indexed with index_value_of when an index PI
 *     preceded it, and is then processed recursively
 *
 * NOTE(review): the test on index_p and any reset of it after use are
 * not visible in this excerpt, nor are the `recword` / `node`
 * parameter declarations.
 */
1014 static void process_xml_element_node(struct filter_info *tinfo,
1015 struct recExtractCtrl *extctr,
1019 /* remember indexing instruction from PI to next element node */
1020 const char *index_p = 0;
1022 /* check if we are an element node in the special zebra namespace
1023 and either set record data or index value-of node content*/
1024 process_xml_element_zebra_node(tinfo, extctr, recword, node);
1026 /* loop through kid nodes */
1027 for (node = node->children; node; node = node->next)
1029 /* check and set PI record and index instructions */
1030 if (node->type == XML_PI_NODE)
1032 process_xml_pi_node(tinfo, extctr, node, &index_p);
1034 else if (node->type == XML_ELEMENT_NODE)
1036 /* if there was a PI index instruction before this element */
1039 index_value_of(tinfo, extctr, recword, node, index_p);
1042 process_xml_element_node(tinfo, extctr, recword,node);
1050 /* DOM filter style indexing */
/* extract_dom_doc_node: index a whole document.
 *
 * Initialises one RecWord through extctr->init and reuses it for every
 * term, then starts the recursive walk at the document node itself
 * (the xmlDocPtr is cast to xmlNodePtr).
 *
 * NOTE(review): the `doc` parameter and the `recword` declaration are
 * not visible in this excerpt.
 */
1051 static void extract_dom_doc_node(struct filter_info *tinfo,
1052 struct recExtractCtrl *extctr,
1055 /* only need to do the initialization once, reuse recword for all terms */
1057 (*extctr->init)(extctr, &recword);
1059 process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
/* convert_extract_doc: run one parsed record through the configured
 * pipeline and index it.
 *
 * Visible steps, in order:
 *   1. reset tinfo->record_info_invoked; an empty document yields
 *      RECCTRL_EXTRACT_SKIP
 *   2. set the "schema" stylesheet parameter to zebra_dom_ns
 *   3. optional debug dump of the incoming document
 *   4. input conversion   (input->convert)
 *   5. store conversion on a copy of the document; the stored form is
 *      serialised and handed to p->setStoreData, then the copy is
 *      freed
 *   6. extract conversion (tinfo->extract->convert)
 *   7. indexing through extract_dom_doc_node
 *
 * Returns RECCTRL_EXTRACT_SKIP when no record instruction was seen
 * during indexing (record_info_invoked still 0), otherwise
 * RECCTRL_EXTRACT_OK.
 *
 * NOTE(review): guards for missing store / extract configuration, the
 * choice between xsltSaveResultToString and xmlDocDumpMemory, release
 * of buf_out and `doc`, and whether the "extract.xml" dump is compiled
 * in are not visible in this excerpt.  No check of the fopen result is
 * visible before fwrite.
 */
1065 static int convert_extract_doc(struct filter_info *tinfo,
1066 struct filter_input *input,
1067 struct recExtractCtrl *p,
1073 const char *params[10];
1074 xsltStylesheetPtr last_xsp = 0;
1075 xmlDocPtr store_doc = 0;
1077 /* per default do not ingest record */
1078 tinfo->record_info_invoked = 0;
1080 /* exit if empty document given */
1082 return RECCTRL_EXTRACT_SKIP;
1084 /* we actually have a document which needs to be processed further */
1086 set_param_str(params, "schema", zebra_dom_ns, tinfo->odr_record);
1088 if (p && p->flagShowRecords)
1093 FILE *outf = fopen("extract.xml", "w");
1094 xmlDocDumpMemory(doc, &buf_out, &len_out);
1095 fwrite(buf_out, 1, len_out, outf);
1097 yaz_log(YLOG_LOG, "Extract Doc: %.*s", len_out, buf_out);
1103 /* input conversion */
1104 perform_convert(tinfo, p, input->convert, params, &doc, 0);
1109 /* store conversion */
1110 store_doc = xmlCopyDoc(doc, 1);
1111 perform_convert(tinfo, p, tinfo->store->convert,
1112 params, &store_doc, &last_xsp);
1115 /* saving either store doc or original doc in case no store doc exists */
1117 xsltSaveResultToString(&buf_out, &len_out,
1118 store_doc ? store_doc : doc, last_xsp);
1120 xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1122 (*p->setStoreData)(p, buf_out, len_out);
1126 xmlFreeDoc(store_doc);
1128 /* extract conversion */
1129 perform_convert(tinfo, p, tinfo->extract->convert, params, &doc, 0);
1132 /* finally, do the indexing */
1134 extract_dom_doc_node(tinfo, p, doc);
1138 /* there was nothing to index, so there is no inserted/updated record */
1139 if (tinfo->record_info_invoked == 0)
1140 return RECCTRL_EXTRACT_SKIP;
1142 return RECCTRL_EXTRACT_OK;
/* extract_xml_split: deliver the next record from an XML stream that
 * is split at element depth input->u.xmlreader.split_level.
 *
 * On the first record of a stream any previous reader is freed and a
 * new xmlTextReader is created over the record stream (ioread_ex /
 * ioclose_ex).  The reader is then advanced node by node; when an
 * element at the split depth is reached it is expanded, deep-copied
 * into a fresh document and passed to convert_extract_doc, whose
 * result is returned.  The reader stays in `input` so that the next
 * call continues after that element.
 *
 * Returns RECCTRL_EXTRACT_ERROR_GENERIC when no reader is available or
 * (judging by the visible cleanup) when expanding the element fails,
 * and RECCTRL_EXTRACT_EOF after the reader is exhausted; in both end
 * cases the reader is freed and cleared.
 *
 * NOTE(review): the loop condition on `ret` and the null test on the
 * expanded node are not visible in this excerpt.
 */
1145 static int extract_xml_split(struct filter_info *tinfo,
1146 struct filter_input *input,
1147 struct recExtractCtrl *p)
1151 if (p->first_record)
1153 if (input->u.xmlreader.reader)
1154 xmlFreeTextReader(input->u.xmlreader.reader);
1155 input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1156 p /* I/O handler */,
1163 if (!input->u.xmlreader.reader)
1164 return RECCTRL_EXTRACT_ERROR_GENERIC;
1166 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1169 int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1170 int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
1172 if (type == XML_READER_TYPE_ELEMENT &&
1173 input->u.xmlreader.split_level == depth)
1177 /* per default do not ingest record */
1178 tinfo->record_info_invoked = 0;
1180 ptr = xmlTextReaderExpand(input->u.xmlreader.reader);
1183 /* we have a new document */
1185 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1186 xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1188 xmlDocSetRootElement(doc, ptr2);
1190 /* writing debug info out */
1191 if (p->flagShowRecords)
1193 xmlChar *buf_out = 0;
1195 xmlDocDumpMemory(doc, &buf_out, &len_out);
1196 yaz_log(YLOG_LOG, "%s: XMLREADER level: %i\n%.*s",
1197 tinfo->fname ? tinfo->fname : "(none)",
1198 depth, len_out, buf_out);
1202 return convert_extract_doc(tinfo, input, p, doc);
1206 xmlFreeTextReader(input->u.xmlreader.reader);
1207 input->u.xmlreader.reader = 0;
1208 return RECCTRL_EXTRACT_ERROR_GENERIC;
1211 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1213 xmlFreeTextReader(input->u.xmlreader.reader);
1214 input->u.xmlreader.reader = 0;
1215 return RECCTRL_EXTRACT_EOF;
/* extract_xml_full: treat the whole input stream as a single XML
 * record.
 *
 * On the first call for a stream (p->first_record) the stream is
 * parsed with xmlReadIO through ioread_ex / ioclose_ex and the
 * document is passed to convert_extract_doc; a parse failure yields
 * RECCTRL_EXTRACT_ERROR_GENERIC (the failing test is not visible in
 * this excerpt).  Any later call returns RECCTRL_EXTRACT_EOF.
 */
1218 static int extract_xml_full(struct filter_info *tinfo,
1219 struct filter_input *input,
1220 struct recExtractCtrl *p)
1222 if (p->first_record) /* only one record per stream */
1224 xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex,
1225 p /* I/O handler */,
1233 return RECCTRL_EXTRACT_ERROR_GENERIC;
1235 return convert_extract_doc(tinfo, input, p, doc);
1238 return RECCTRL_EXTRACT_EOF;
/* extract_iso2709: read one ISO 2709 (MARC) record from the stream,
 * convert it to MARCXML and run it through convert_extract_doc.
 *
 * Visible steps:
 *   1. read the 5-byte length field; a short read means
 *      RECCTRL_EXTRACT_EOF
 *   2. while the first byte is not a digit, log it, shift the buffer
 *      left by one and read one more byte (resynchronisation)
 *   3. parse the length with atoi_n; lengths below 25 are rejected
 *   4. read the remaining record_length - 5 bytes; a short read is an
 *      error
 *   5. decode with yaz_marc_read_iso2709; a result smaller than
 *      record_length is an error
 *   6. write the record as XML in the MARC21 slim namespace, wrap the
 *      root node in a new document and pass it to convert_extract_doc
 *
 * NOTE(review): the declaration and size of `buf` are not visible in
 * this excerpt; record_length comes from five ASCII digits, so the
 * buffer must hold up to 99999 bytes - confirm.  The final
 * `return RECCTRL_EXTRACT_OK` follows an unconditional-looking return;
 * the surrounding braces are not visible.
 */
1241 static int extract_iso2709(struct filter_info *tinfo,
1242 struct filter_input *input,
1243 struct recExtractCtrl *p)
1249 if (p->stream->readf(p->stream, buf, 5) != 5)
1250 return RECCTRL_EXTRACT_EOF;
1251 while (*buf < '0' || *buf > '9')
1255 dom_log(YLOG_WARN, tinfo, 0,
1256 "MARC: Skipping bad byte %d (0x%02X)",
1257 *buf & 0xff, *buf & 0xff);
1258 for (i = 0; i<4; i++)
1261 if (p->stream->readf(p->stream, buf+4, 1) != 1)
1262 return RECCTRL_EXTRACT_EOF;
1264 record_length = atoi_n (buf, 5);
1265 if (record_length < 25)
1267 dom_log(YLOG_WARN, tinfo, 0,
1268 "MARC record length < 25, is %d", record_length);
1269 return RECCTRL_EXTRACT_ERROR_GENERIC;
1271 read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1272 if (read_bytes < record_length-5)
1274 dom_log(YLOG_WARN, tinfo, 0,
1275 "couldn't read whole MARC record");
1276 return RECCTRL_EXTRACT_ERROR_GENERIC;
1278 r = yaz_marc_read_iso2709(input->u.marc.handle, buf, record_length);
1279 if (r < record_length)
1281 dom_log (YLOG_WARN, tinfo, 0,
1282 "parsing of MARC record failed r=%d length=%d",
1284 return RECCTRL_EXTRACT_ERROR_GENERIC;
1290 yaz_marc_write_xml(input->u.marc.handle, &root_ptr,
1291 "http://www.loc.gov/MARC21/slim", 0, 0);
1292 rdoc = xmlNewDoc((const xmlChar*) "1.0");
1293 xmlDocSetRootElement(rdoc, root_ptr);
1294 return convert_extract_doc(tinfo, input, p, rdoc);
1296 return RECCTRL_EXTRACT_OK;
/* filter_extract: extract entry point of the filter.
 *
 * Uses the first configured input (tinfo->input_list), resets the
 * per-record ODR memory and dispatches on the input type:
 *   DOM_INPUT_XMLREADER - extract_xml_full when split_level is 0,
 *                         otherwise extract_xml_split
 *   DOM_INPUT_MARC      - extract_iso2709
 * Any other case yields RECCTRL_EXTRACT_ERROR_GENERIC.
 *
 * NOTE(review): the test guarding the first error return (presumably
 * a missing input) and the switch header are not visible in this
 * excerpt.  Only the head of input_list is consulted in the visible
 * lines.
 */
1299 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1301 struct filter_info *tinfo = clientData;
1302 struct filter_input *input = tinfo->input_list;
1305 return RECCTRL_EXTRACT_ERROR_GENERIC;
1307 odr_reset(tinfo->odr_record);
1310 case DOM_INPUT_XMLREADER:
1311 if (input->u.xmlreader.split_level == 0)
1312 return extract_xml_full(tinfo, input, p);
1314 return extract_xml_split(tinfo, input, p);
1316 case DOM_INPUT_MARC:
1317 return extract_iso2709(tinfo, input, p);
1319 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* ioread_ret: libxml2 read callback used during retrieval.
 *
 * `context` is the struct recRetrieveCtrl handed to xmlReadIO; the
 * request for up to `len` bytes is forwarded to the record stream's
 * readf function and its result is returned unchanged.
 */
1322 static int ioread_ret(void *context, char *buffer, int len)
1324 struct recRetrieveCtrl *p = context;
1325 return p->stream->readf(p->stream, buffer, len);
/* ioclose_ret: libxml2 close callback paired with ioread_ret in the
 * xmlReadIO call of filter_retrieve.
 * NOTE(review): the body is not visible in this excerpt. */
1328 static int ioclose_ret(void *context)
/* filter_retrieve: retrieve entry point of the filter.
 *
 * 1. Determines the requested element set name `esn` from p->comp
 *    (simple generic element set name, or the elementSetName of a
 *    complex spec); it stays 0 when none is given.
 * 2. Looks up the matching <retrieve> configuration; when none is
 *    found the "element set name not valid" diagnostic is set.
 * 3. Builds stylesheet parameters: id, filename, rank (only when
 *    staticrank >= 0), schema (from esn, the entry's name, its
 *    identifier, or ""), score and size.
 * 4. Parses the stored record from the stream with xmlReadIO
 *    (XINCLUDE | NOENT | NONET) and applies the retrieve conversion
 *    chain.
 * 5. Serialises the result and copies it into p->odr memory as XML or
 *    SUTRS; otherwise sets YAZ_BIB1_RECORD_SYNTAX_UNSUPP.  Parse or
 *    conversion failures set
 *    YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS.
 *
 * NOTE(review): the XML branch tests p->input_format, while the SUTRS
 * branch tests p->output_format.  The requested syntax appears to be
 * carried in input_format, so the SUTRS test looks like it should use
 * input_format as well - confirm against the recRetrieveCtrl contract.
 *
 * NOTE(review): the guards around the schema / filename parameters,
 * the choice between xsltSaveResultToString and xmlDocDumpMemory,
 * release of buf_out and `doc`, and the return statements are not
 * visible in this excerpt.
 */
1333 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
1335 /* const char *esn = zebra_dom_ns; */
1336 const char *esn = 0;
1337 const char *params[32];
1338 struct filter_info *tinfo = clientData;
1340 struct filter_retrieve *retrieve;
1341 xsltStylesheetPtr last_xsp = 0;
1345 if (p->comp->which == Z_RecordComp_simple
1346 && p->comp->u.simple->which == Z_ElementSetNames_generic)
1348 esn = p->comp->u.simple->u.generic;
1350 else if (p->comp->which == Z_RecordComp_complex
1351 && p->comp->u.complex->generic->elementSpec
1352 && p->comp->u.complex->generic->elementSpec->which ==
1353 Z_ElementSpec_elementSetName)
1355 esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1358 retrieve = lookup_retrieve(tinfo, esn);
1362 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1367 set_param_int(params, "id", p->localno, p->odr);
1369 set_param_str(params, "filename", p->fname, p->odr);
1370 if (p->staticrank >= 0)
1371 set_param_int(params, "rank", p->staticrank, p->odr);
1374 set_param_str(params, "schema", esn, p->odr);
1377 set_param_str(params, "schema", retrieve->name, p->odr);
1378 else if (retrieve->identifier)
1379 set_param_str(params, "schema", retrieve->identifier, p->odr);
1381 set_param_str(params, "schema", "", p->odr);
1384 set_param_int(params, "score", p->score, p->odr);
1385 set_param_int(params, "size", p->recordSize, p->odr);
1387 doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1390 XML_PARSE_XINCLUDE | XML_PARSE_NOENT | XML_PARSE_NONET);
1393 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1397 /* retrieve conversion */
1398 perform_convert(tinfo, 0, retrieve->convert, params, &doc, &last_xsp);
1401 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1403 else if (!p->input_format
1404 || !oid_oidcmp(p->input_format, yaz_oid_recsyn_xml))
1410 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1412 xmlDocDumpMemory(doc, &buf_out, &len_out);
1414 p->output_format = yaz_oid_recsyn_xml;
1415 p->rec_len = len_out;
1416 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1417 memcpy(p->rec_buf, buf_out, p->rec_len);
1420 else if (!oid_oidcmp(p->output_format, yaz_oid_recsyn_sutrs))
1426 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1428 xmlDocDumpMemory(doc, &buf_out, &len_out);
1430 p->output_format = yaz_oid_recsyn_sutrs;
1431 p->rec_len = len_out;
1432 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1433 memcpy(p->rec_buf, buf_out, p->rec_len);
1439 p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
1445 static struct recType filter_type = {
1456 #ifdef IDZEBRA_STATIC_DOM
1469 * indent-tabs-mode: nil
1471 * vim: shiftwidth=4 tabstop=8 expandtab