1 /* $Id: mod_dom.c,v 1.34 2007-04-07 22:18:46 adam Exp $
2 Copyright (C) 1995-2007
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 #include <yaz/diagbib1.h>
29 #include <yaz/tpath.h>
30 #include <yaz/snprintf.h>
32 #include <libxml/xmlversion.h>
33 #include <libxml/parser.h>
34 #include <libxml/tree.h>
35 #include <libxml/xmlIO.h>
36 #include <libxml/xmlreader.h>
37 #include <libxslt/transform.h>
38 #include <libxslt/xsltutils.h>
41 #include <libexslt/exslt.h>
44 #include <idzebra/util.h>
45 #include <idzebra/recctrl.h>
47 /* Namespace URI that marks elements as DOM filter indexing
   instructions; element namespaces are compared against zebra_dom_ns. */
48 #define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
49 static const char *zebra_dom_ns = ZEBRA_DOM_NS;
51 /* Name of the processing instruction that carries DOM filter indexing
   instructions; PI node names are compared against zebra_pi_name. */
52 #define ZEBRA_PI_NAME "zebra-2.0"
53 static const char *zebra_pi_name = ZEBRA_PI_NAME;
/* One conversion step: the stylesheet name as given in the configuration,
   the stylesheet parsed from it, and the link to the next step.
   NOTE(review): the struct header line is not visible here; these are
   presumably the members of struct convert_s. */
58 const char *stylesheet;
59 xsltStylesheetPtr stylesheet_xsp;
60 struct convert_s *next;
/* Conversion chain configured by an <extract> element. */
63 struct filter_extract {
65 struct convert_s *convert;
/* NOTE(review): presumably the convert member of struct filter_store (the
   chain configured by a <store> element); its header is not visible here. */
69 struct convert_s *convert;
/* One <retrieve> entry: schema identifier, conversion chain and the link
   to the next entry of the list. */
72 struct filter_retrieve {
74 const char *identifier;
75 struct convert_s *convert;
76 struct filter_retrieve *next;
/* Tags selecting which reader an input description uses. */
79 #define DOM_INPUT_XMLREADER 1
80 #define DOM_INPUT_MARC 2
/* NOTE(review): presumably members of struct filter_input (header not
   visible here): conversion chain, XML text reader, MARC input character
   set and the link to the next input description. */
84 struct convert_s *convert;
88 xmlTextReaderPtr reader;
92 const char *input_charset;
97 struct filter_input *next;
/* NOTE(review): presumably members of struct filter_info (header not
   visible here).  profile_path is the search path used to resolve the
   configuration file; doc_config is the parsed configuration document;
   record_info_invoked counts the record instructions seen for the
   record currently being extracted. */
103 const char *profile_path;
106 xmlDocPtr doc_config;
107 struct filter_extract *extract;
108 struct filter_retrieve *retrieve_list;
109 struct filter_input *input_list;
110 struct filter_store *store;
111 int record_info_invoked;
/* strcmp()/strlen() wrappers that cast their first argument to char*, so
   xmlChar strings can be passed without a cast at every call site. */
116 #define XML_STRCMP(a,b) strcmp((char*)a, b)
117 #define XML_STRLEN(a) strlen((char*)a)
/* Loop header: advances ptr along ->next and executes the statement that
   follows only for nodes of type XML_ELEMENT_NODE.  There is no
   initialiser, so ptr must already point at the first sibling, and ptr
   itself is modified (null once the loop has run to completion).
   The expansion ends in an unbraced `if`, so an `else` written directly
   after the loop body would bind to that `if`. */
120 #define FOR_EACH_ELEMENT(ptr) for (; ptr; ptr = ptr->next) if (ptr->type == XML_ELEMENT_NODE)
/* Logging helper of the DOM filter.  The prototype carries a printf
   format attribute so the compiler can check fmt (argument 4) against
   the variadic arguments (starting at argument 5). */
122 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
123 const char *fmt, ...)
125 __attribute__ ((format (printf, 4, 5)))
/* Formats the message into a local buffer with yaz_vsnprintf() and hands
   it to yaz_log(), prefixed with tinfo->fname ("none" when unset).  One of
   the two yaz_log() calls additionally prints xmlGetLineNo(ptr).
   NOTE(review): the condition choosing between the two calls is not
   visible here - presumably whether ptr is non-null. */
129 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
130 const char *fmt, ...)
136 yaz_vsnprintf(buf, sizeof(buf)-1, fmt, ap);
139 yaz_log(level, "%s:%ld: %s", tinfo->fname ? tinfo->fname : "none",
140 xmlGetLineNo(ptr), buf);
144 yaz_log(level, "%s: %s", tinfo->fname ? tinfo->fname : "none", buf);
/* Prepares a string parameter for a stylesheet: value is wrapped in
   single quotes in a buffer of strlen(value) + 3 bytes (two quotes plus
   the terminating NUL) allocated from odr.
   NOTE(review): how name and the quoted value are stored in params is not
   visible here.  A single quote inside value is not escaped. */
150 static void set_param_str(const char **params, const char *name,
151 const char *value, ODR odr)
153 char *quoted = odr_malloc(odr, 3 + strlen(value));
154 sprintf(quoted, "'%s'", value);
/* Integer counterpart of set_param_str(): formats value with ZINT_FORMAT
   between single quotes into a 30-byte buffer allocated from odr.
   NOTE(review): the remainder of the parameter list and the storing of
   the pair into params are not visible here. */
162 static void set_param_int(const char **params, const char *name,
165 char *quoted = odr_malloc(odr, 30); /* 25 digits enough for 2^64 */
168 sprintf(quoted, "'" ZINT_FORMAT "'", value);
/* Allocates a filter_info with xmalloc() and puts it into an empty state:
   full_name, profile_path, the retrieve and input lists, doc_config and
   record_info_invoked are cleared.  Two ODR memory handles are created,
   odr_record and odr_config.
   NOTE(review): the return statement is not visible here; presumably
   tinfo is returned as the client data passed to the other callbacks. */
174 static void *filter_init(Res res, RecType recType)
176 struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
178 tinfo->full_name = 0;
179 tinfo->profile_path = 0;
180 tinfo->odr_record = odr_createmem(ODR_ENCODE);
181 tinfo->odr_config = odr_createmem(ODR_ENCODE);
183 tinfo->retrieve_list = 0;
184 tinfo->input_list = 0;
186 tinfo->doc_config = 0;
187 tinfo->record_info_invoked = 0;
/* Tests whether attr is called name and has a text node as its first
   child; if so, *dst_content is pointed at that node's content.  The
   pointer refers to memory owned by the XML tree; nothing is copied.
   NOTE(review): the return statements are not visible here; callers use
   the result as a boolean "matched" flag. */
196 static int attr_content(struct _xmlAttr *attr, const char *name,
197 const char **dst_content)
199 if (!XML_STRCMP(attr->name, name) && attr->children
200 && attr->children->type == XML_TEXT_NODE)
202 *dst_content = (const char *)(attr->children->content);
/* Frees the parsed stylesheet of a conversion step when one is set.
   NOTE(review): only the handling of a single step is visible here;
   presumably the function walks the whole chain starting at c. */
208 static void destroy_xsp(struct convert_s *c)
212 if (c->stylesheet_xsp)
213 xsltFreeStylesheet(c->stylesheet_xsp);
/* Releases everything that was built from the configuration file:
   the stylesheets of the extract, store, input and retrieve chains, the
   per-input reader resources, and the parsed configuration document.
   The list nodes themselves are allocated from odr_config, so they are
   reclaimed by the final odr_reset() rather than freed one by one.
   NOTE(review): the guards around the extract/store calls and the switch
   on the input type are not visible here. */
218 static void destroy_dom(struct filter_info *tinfo)
222 destroy_xsp(tinfo->extract->convert);
227 destroy_xsp(tinfo->store->convert);
230 if (tinfo->input_list)
232 struct filter_input *i_ptr;
233 for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
/* XML reader inputs own a text reader ... */
237 case DOM_INPUT_XMLREADER:
238 if (i_ptr->u.xmlreader.reader)
239 xmlFreeTextReader(i_ptr->u.xmlreader.reader);
/* ... MARC inputs own an iconv handle and a MARC handle */
242 yaz_iconv_close(i_ptr->u.marc.iconv);
243 yaz_marc_destroy(i_ptr->u.marc.handle);
246 destroy_xsp(i_ptr->convert);
248 tinfo->input_list = 0;
250 if (tinfo->retrieve_list)
252 struct filter_retrieve *r_ptr;
253 for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
254 destroy_xsp(r_ptr->convert);
255 tinfo->retrieve_list = 0;
258 if (tinfo->doc_config)
260 xmlFreeDoc(tinfo->doc_config);
261 tinfo->doc_config = 0;
263 odr_reset(tinfo->odr_config);
/* Builds a conversion chain from the element siblings starting at ptr.
   Every <xslt> element gets a convert_s allocated from odr_config; its
   @stylesheet attribute names a file that is resolved with
   yaz_filepath_resolve() and parsed with xsltParseStylesheetFile().
   Warnings are logged for unknown attributes, a stylesheet that cannot be
   found or parsed, a missing @stylesheet, and any element other than
   <xslt>.
   NOTE(review): the linking of new steps through l and the return values
   are not visible here. */
266 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
267 struct convert_s **l)
270 FOR_EACH_ELEMENT(ptr) {
271 if (!XML_STRCMP(ptr->name, "xslt"))
273 struct _xmlAttr *attr;
275 = odr_malloc(tinfo->odr_config, sizeof(*p));
279 p->stylesheet_xsp = 0;
/* the stylesheet name points into the configuration document */
281 for (attr = ptr->properties; attr; attr = attr->next)
282 if (attr_content(attr, "stylesheet", &p->stylesheet))
286 dom_log(YLOG_WARN, tinfo, ptr,
287 "bad attribute @%s", attr->name);
291 char tmp_xslt_full_name[1024];
292 if (!yaz_filepath_resolve(p->stylesheet,
297 dom_log(YLOG_WARN, tinfo, 0,
298 "stylesheet %s not found in "
301 tinfo->profile_path);
306 = xsltParseStylesheetFile((const xmlChar*)
308 if (!p->stylesheet_xsp)
310 dom_log(YLOG_WARN, tinfo, 0,
311 "could not parse xslt stylesheet %s",
318 dom_log(YLOG_WARN, tinfo, ptr,
319 "missing attribute 'stylesheet' ");
327 dom_log(YLOG_WARN, tinfo, ptr,
328 "bad element '%s', expected <xslt>", ptr->name);
/* Runs *doc through every step of the convert chain.  Each step applies
   its stylesheet with xsltApplyStylesheet(), serialises the result with
   xsltSaveResultToString() and re-parses that text with xmlParseMemory()
   into *doc.  The stylesheet of the step is also reported through
   *last_xsp.
   NOTE(review): some callers pass 0 for last_xsp, so the store to
   *last_xsp is presumably guarded by a check that is not visible here;
   the freeing of intermediate documents and buffers and the return
   values are not visible either. */
335 static ZEBRA_RES perform_convert(struct filter_info *tinfo,
336 struct recExtractCtrl *extctr,
337 struct convert_s *convert,
340 xsltStylesheetPtr *last_xsp)
342 for (; convert; convert = convert->next)
344 xmlChar *buf_out = 0;
346 xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
349 *last_xsp = convert->stylesheet_xsp;
354 /* now saving into buffer and re-reading into DOM to avoid annoying
355 XSLT problem with thrown-out indentation text nodes */
356 xsltSaveResultToString(&buf_out, &len_out, res_doc,
357 convert->stylesheet_xsp);
362 *doc = xmlParseMemory((const char *) buf_out, len_out);
364 /* writing debug info out */
365 if (extctr && extctr->flagShowRecords)
366 yaz_log(YLOG_LOG, "%s: XSLT %s\n %.*s",
367 tinfo->fname ? tinfo->fname : "(none)",
/* Appends a new input description to tinfo->input_list: np is advanced
   to the terminating next pointer of the list and the node, allocated
   from odr_config, is linked in there.
   NOTE(review): the initialisation of the node (its type from the type
   argument, convert, next) and the return are not visible here. */
376 static struct filter_input *new_input(struct filter_info *tinfo, int type)
378 struct filter_input *p;
379 struct filter_input **np = &tinfo->input_list;
380 for (;*np; np = &(*np)->next)
382 p = *np = odr_malloc(tinfo->odr_config, sizeof(*p));
/* Reads the element siblings found inside an <input> element.
   <marc> opens an iconv conversion from @inputcharset (default "marc-8")
   to utf-8 and creates a DOM_INPUT_MARC input whose MARC handle uses it.
   <xmlreader> creates a DOM_INPUT_XMLREADER input whose split_level is
   taken from @level with atoi().  For both, parse_convert() is called to
   fill the input's conversion chain.  Unknown attributes and elements
   are logged as warnings.
   NOTE(review): the return values, the checks around the iconv result
   and level_str, and any use of syntax and name are not visible here. */
391 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
392 const char *syntax, const char *name)
394 FOR_EACH_ELEMENT(ptr) {
395 if (!XML_STRCMP(ptr->name, "marc"))
397 yaz_iconv_t iconv = 0;
398 const char *input_charset = "marc-8";
399 struct _xmlAttr *attr;
401 for (attr = ptr->properties; attr; attr = attr->next)
403 if (attr_content(attr, "inputcharset", &input_charset))
407 dom_log(YLOG_WARN, tinfo, ptr,
408 "bad attribute @%s, expected @inputcharset",
/* records are converted to utf-8 while the MARC data is read */
412 iconv = yaz_iconv_open("utf-8", input_charset);
415 dom_log(YLOG_WARN, tinfo, ptr,
416 "unsupported @charset '%s'", input_charset);
421 struct filter_input *p
422 = new_input(tinfo, DOM_INPUT_MARC);
423 p->u.marc.handle = yaz_marc_create();
424 p->u.marc.iconv = iconv;
426 yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
430 parse_convert(tinfo, ptr, &p->convert);
435 else if (!XML_STRCMP(ptr->name, "xmlreader"))
437 struct filter_input *p
438 = new_input(tinfo, DOM_INPUT_XMLREADER);
439 struct _xmlAttr *attr;
440 const char *level_str = 0;
/* level 0 with no reader yet: defaults until @level says otherwise */
442 p->u.xmlreader.split_level = 0;
443 p->u.xmlreader.reader = 0;
445 for (attr = ptr->properties; attr; attr = attr->next)
447 if (attr_content(attr, "level", &level_str))
451 dom_log(YLOG_WARN, tinfo, ptr,
452 "bad attribute @%s, expected @level",
457 p->u.xmlreader.split_level = atoi(level_str);
461 parse_convert(tinfo, ptr, &p->convert);
466 dom_log(YLOG_WARN, tinfo, ptr,
467 "bad element <%s>, expected <marc>|<xmlreader>",
/* Loads the filter configuration file named fname.
   The name is resolved with yaz_filepath_resolve() against
   tinfo->profile_path to obtain full_name (the second assignment of
   full_name uses the bare name - presumably the fallback when resolution
   fails).  The file is parsed with xmlParseFile() and the document is
   kept in tinfo->doc_config, because attribute strings stored while
   parsing point into it.  The root element must be <dom>; <extract>,
   <retrieve>, <store> and <input> elements are parsed into the
   corresponding parts of tinfo and anything else is logged as a warning.
   When no input has been configured, a DOM_INPUT_XMLREADER input with
   split_level 0 is added.
   NOTE(review): the return values, the descent from the root element to
   its children, and the linking of the new extract/retrieve/store
   objects into tinfo are not visible here.  The bare <...> lines below
   are remnants of example comments whose delimiters are not visible. */
475 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
477 char tmp_full_name[1024];
481 tinfo->fname = odr_strdup(tinfo->odr_config, fname);
483 if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path,
484 NULL, tmp_full_name))
485 tinfo->full_name = odr_strdup(tinfo->odr_config, tmp_full_name);
487 tinfo->full_name = odr_strdup(tinfo->odr_config, tinfo->fname);
489 yaz_log(YLOG_LOG, "%s dom filter: "
490 "loading config file %s", tinfo->fname, tinfo->full_name);
492 doc = xmlParseFile(tinfo->full_name);
495 yaz_log(YLOG_WARN, "%s: dom filter: "
496 "failed to parse config file %s",
497 tinfo->fname, tinfo->full_name);
500 /* save because we store ptrs to the content */
501 tinfo->doc_config = doc;
503 ptr = xmlDocGetRootElement(doc);
504 if (!ptr || ptr->type != XML_ELEMENT_NODE
505 || XML_STRCMP(ptr->name, "dom"))
507 dom_log(YLOG_WARN, tinfo, ptr,
508 "bad root element <%s>, expected root element <dom>",
514 FOR_EACH_ELEMENT(ptr) {
515 if (!XML_STRCMP(ptr->name, "extract"))
518 <extract name="index">
519 <xslt stylesheet="first.xsl"/>
520 <xslt stylesheet="second.xsl"/>
523 struct _xmlAttr *attr;
524 struct filter_extract *f =
525 odr_malloc(tinfo->odr_config, sizeof(*f));
530 for (attr = ptr->properties; attr; attr = attr->next)
532 if (attr_content(attr, "name", &f->name))
536 dom_log(YLOG_WARN, tinfo, ptr,
537 "bad attribute @%s, expected @name",
541 parse_convert(tinfo, ptr->children, &f->convert);
543 else if (!XML_STRCMP(ptr->name, "retrieve"))
547 <xslt stylesheet="some.xsl"/>
548 <xslt stylesheet="some.xsl"/>
551 struct _xmlAttr *attr;
552 struct filter_retrieve **fp = &tinfo->retrieve_list;
553 struct filter_retrieve *f =
554 odr_malloc(tinfo->odr_config, sizeof(*f));
565 for (attr = ptr->properties; attr; attr = attr->next)
567 if (attr_content(attr, "identifier",
570 else if (attr_content(attr, "name", &f->name))
574 dom_log(YLOG_WARN, tinfo, ptr,
575 "bad attribute @%s, expected @identifier|@name",
579 parse_convert(tinfo, ptr->children, &f->convert);
581 else if (!XML_STRCMP(ptr->name, "store"))
585 <xslt stylesheet="some.xsl"/>
586 <xslt stylesheet="some.xsl"/>
589 struct filter_store *f =
590 odr_malloc(tinfo->odr_config, sizeof(*f));
594 parse_convert(tinfo, ptr->children, &f->convert);
596 else if (!XML_STRCMP(ptr->name, "input"))
600 <xmlreader level="1"/>
602 <input syntax="usmarc">
603 <marc inputcharset="marc-8"/>
606 struct _xmlAttr *attr;
607 const char *syntax = 0;
608 const char *name = 0;
609 for (attr = ptr->properties; attr; attr = attr->next)
611 if (attr_content(attr, "syntax", &syntax))
613 else if (attr_content(attr, "name", &name))
617 dom_log(YLOG_WARN, tinfo, ptr,
618 "bad attribute @%s, expected @syntax|@name",
622 parse_input(tinfo, ptr->children, syntax, name);
626 dom_log(YLOG_WARN, tinfo, ptr,
628 "expected <extract>|<input>|<retrieve>|<store>",
/* no <input> configured: fall back to reading whole XML documents */
633 if (!tinfo->input_list)
635 struct filter_input *p
636 = new_input(tinfo, DOM_INPUT_XMLREADER);
637 p->u.xmlreader.split_level = 0;
638 p->u.xmlreader.reader = 0;
/* Finds the retrieve entry that matches element set name est, comparing
   est with the identifier and with the name of each entry in
   tinfo->retrieve_list.  According to the comment below, the first entry
   is used when no est is given.
   NOTE(review): the declaration of est and the return statements are not
   visible here. */
643 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
646 struct filter_retrieve *f = tinfo->retrieve_list;
648 /* return first schema if no est is provided */
651 for (; f; f = f->next)
653 /* find requested schema */
656 if (f->identifier && !strcmp(f->identifier, est))
658 if (f->name && !strcmp(f->name, est))
/* Configures the filter from args, the name of the configuration file.
   A warning is logged when the configuration file is missing.  When args
   equals the already loaded tinfo->fname, the branch taken is not visible
   here (presumably an early success return so the file is not reloaded).
   Otherwise profilePath is read from res and parse_dom() loads the file.
   NOTE(review): any cleanup of a previously loaded configuration before
   parse_dom() is not visible here. */
665 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
667 struct filter_info *tinfo = clientData;
670 yaz_log(YLOG_WARN, "dom filter: need config file");
674 if (tinfo->fname && !strcmp(args, tinfo->fname))
677 tinfo->profile_path = res_get(res, "profilePath");
680 return parse_dom(tinfo, args);
/* Destroys both ODR memory handles owned by the filter.
   NOTE(review): the rest of the teardown (configuration data, tinfo
   itself) is not visible here. */
683 static void filter_destroy(void *clientData)
685 struct filter_info *tinfo = clientData;
687 odr_destroy(tinfo->odr_config);
688 odr_destroy(tinfo->odr_record);
/* Read callback handed to libxml2 during extraction.  context is the
   recExtractCtrl; up to len bytes are pulled from its stream with readf()
   into buffer and the result of readf() is returned unchanged. */
692 static int ioread_ex(void *context, char *buffer, int len)
694 struct recExtractCtrl *p = context;
695 return p->stream->readf(p->stream, buffer, len);
/* Close callback passed to libxml2 together with ioread_ex().
   NOTE(review): the body is not visible here. */
698 static int ioclose_ex(void *context)
704 /* Tests whether attr is called name and has a text node as its first
   child; if so, *dst_content is pointed at that node's content (owned by
   the XML tree, not copied).  The visible logic is the same as in
   attr_content().
   NOTE(review): the return statements are not visible here; callers use
   the result as a boolean "matched" flag. */
705 static int attr_content_xml(struct _xmlAttr *attr, const char *name,
706 const char **dst_content)
708 if (0 == XML_STRCMP(attr->name, name) && attr->children
709 && attr->children->type == XML_TEXT_NODE)
711 *dst_content = (const char *) (attr->children->content);
718 /* Indexes the text content of node under every index listed in index_p.
   index_p is a blank separated list of "name" or "name:type" items.  For
   each item, recword receives the node text, the index name and the
   first character of the type, and is passed to extctr->tokenAdd().
   Nothing is indexed unless exactly one record instruction has been seen
   for the current record (record_info_invoked == 1).
   NOTE(review): strlen() is applied to the result of xmlNodeGetContent()
   before any visible null check - confirm text cannot be null here.
   The declarations of index/type/bval/eval, the bounds of the index and
   type buffers, and the freeing of text are not visible here. */
719 static void index_value_of(struct filter_info *tinfo,
720 struct recExtractCtrl *extctr,
725 if (tinfo->record_info_invoked == 1)
727 xmlChar *text = xmlNodeGetContent(node);
728 size_t text_len = strlen((const char *)text);
730 /* if there is no text, we do not need to proceed */
733 const char *look = index_p;
740 /* assigning text to be indexed */
741 recword->term_buf = (const char *)text;
742 recword->term_len = text_len;
744 /* parsing all index name/type pairs */
745 /* may not start with ' ' or ':' */
746 while (*look && ' ' != *look && ':' != *look)
748 /* setting name and type to zero */
752 /* parsing one index name */
754 while (*look && ':' != *look && ' ' != *look)
/* strncpy() does not terminate the copy, hence the explicit NUL */
759 strncpy((char *)index, (const char *)bval, eval - bval);
760 index[eval - bval] = '\0';
763 /* parsing one index type, if existing */
769 while (*look && ' ' != *look)
774 strncpy((char *)type, (const char *)bval, eval - bval);
775 type[eval - bval] = '\0';
778 /* actually indexing the text given */
779 dom_log(YLOG_DEBUG, tinfo, 0,
780 "INDEX '%s:%s' '%s'",
781 index ? (const char *) index : "null",
782 type ? (const char *) type : "null",
783 text ? (const char *) text : "null");
785 recword->index_name = (const char *)index;
787 recword->index_type = *type;
789 /* writing debug out */
790 if (extctr->flagShowRecords)
791 dom_log(YLOG_LOG, tinfo, 0,
792 "INDEX '%s:%s' '%s'",
793 index ? (const char *) index : "null",
794 type ? (const char *) type : "null",
795 text ? (const char *) text : "null");
/* index_name and index_type are assigned a second time here, with the
   same values as above */
797 /* actually indexing the text given */
798 recword->index_name = (const char *)index;
800 recword->index_type = *type;
801 (extctr->tokenAdd)(recword);
803 /* eat whitespaces */
804 if (*look && ' ' == *look)
815 /* Applies a record instruction to the extract control.
   id_p is scanned with "%255s" into extctr->match_criteria, rank_p is
   converted with atozint() into extctr->staticrank, and type_p selects
   extctr->action: "insert", "delete", "replace" or "update", with
   action_update as the initial value.  A warning is logged when
   record_info_invoked is already 1, and the counter is incremented.
   NOTE(review): the guard around the sscanf() of id_p and the `else`
   before the "bad @type value" warning are not visible here. */
816 static void set_record_info(struct filter_info *tinfo,
817 struct recExtractCtrl *extctr,
823 /* writing debug info out */
824 if (extctr && extctr->flagShowRecords)
825 dom_log(YLOG_LOG, tinfo, node,
826 "RECORD id=%s rank=%s type=%s",
827 id_p ? (const char *) id_p : "(null)",
828 rank_p ? (const char *) rank_p : "(null)",
829 type_p ? (const char *) type_p : "(null)");
833 sscanf((const char *)id_p, "%255s", extctr->match_criteria);
835 if (rank_p && *rank_p)
836 extctr->staticrank = atozint((const char *)rank_p);
838 if (type_p && *type_p)
840 enum zebra_recctrl_action_t action = action_update;
841 if (!strcmp(type_p, "insert"))
842 action = action_insert;
843 else if (!strcmp(type_p, "delete"))
844 action = action_delete;
845 else if (!strcmp(type_p, "replace"))
846 action = action_replace;
847 else if (!strcmp(type_p, "update"))
848 action = action_update;
850 dom_log(YLOG_WARN, tinfo, node, "bad @type value: %s", type_p);
851 extctr->action = action;
852 yaz_log(YLOG_LOG, "In mod_dom.c: setting action to %d", action);
855 if (tinfo->record_info_invoked == 1)
857 /* warn about multiple only once */
858 dom_log(YLOG_WARN, tinfo, node, "multiple record elements");
860 tinfo->record_info_invoked++;
865 /* Handles an element in the zebra namespace (zebra_dom_ns).
   <index> passes its @name attribute to index_value_of(), which indexes
   the element's text content.  <record> collects @id, @rank and @type and
   passes them to set_record_info().  Other attributes, and other element
   names in that namespace, are logged as warnings.  Nodes that are not
   elements, or that are in a different namespace, are ignored.
   NOTE(review): the declarations of the recword and node parameters are
   not visible here. */
866 static void process_xml_element_zebra_node(struct filter_info *tinfo,
867 struct recExtractCtrl *extctr,
871 if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href
872 && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns))
874 if (0 == XML_STRCMP(node->name, "index"))
876 const char *index_p = 0;
878 struct _xmlAttr *attr;
879 for (attr = node->properties; attr; attr = attr->next)
881 if (attr_content_xml(attr, "name", &index_p))
883 index_value_of(tinfo, extctr, recword, node, index_p);
887 dom_log(YLOG_WARN, tinfo, node,
888 "bad attribute @%s, expected @name",
893 else if (0 == XML_STRCMP(node->name, "record"))
895 const char *id_p = 0;
896 const char *rank_p = 0;
897 const char *type_p = 0;
899 struct _xmlAttr *attr;
900 for (attr = node->properties; attr; attr = attr->next)
902 if (attr_content_xml(attr, "id", &id_p))
904 else if (attr_content_xml(attr, "rank", &rank_p))
906 else if (attr_content_xml(attr, "type", &type_p))
910 dom_log(YLOG_WARN, tinfo, node,
911 "bad attribute @%s, expected @id|@rank|@type",
/* attributes that were absent stay null */
915 set_record_info(tinfo, extctr, node, id_p, rank_p, type_p);
919 dom_log(YLOG_WARN, tinfo, node,
921 " expected <record>|<index> in namespace '%s'",
922 node->name, zebra_dom_ns);
/* Parses one name=value pair at *c_ptr inside a processing instruction.
   Leading blanks are skipped; when the text continues with name followed
   by '=', the value is scanned up to the next blank, and blanks after it
   are skipped.
   NOTE(review): the copy into value (presumably bounded by value_max),
   the update of *c_ptr and the return values are not visible here. */
927 static int attr_content_pi(const char **c_ptr, const char *name,
928 char *value, size_t value_max)
930 size_t name_len = strlen(name);
931 const char *look = *c_ptr;
935 while (*look && ' ' == *look)
/* only look at look[name_len] when the remaining text is longer than name */
937 if (strlen(look) > name_len)
939 if (look[name_len] == '=' && !memcmp(look, name, name_len))
943 while (*look && ' ' != *look)
953 while (*look && ' ' == *look)
959 /* Handles a processing instruction node.  Only PIs whose name equals
   zebra_pi_name are examined.  Content starting with "record" is parsed
   for id=, rank= and type= pairs, which are passed to set_record_info().
   Content starting with "index" has the following blanks skipped and,
   according to the comment below, is exported through index_pp.  Content
   that cannot be parsed is logged as a warning.
   NOTE(review): the declarations of id/rank/type, the loop around the
   attr_content_pi() calls and the store to *index_pp are not visible
   here. */
960 static void process_xml_pi_node(struct filter_info *tinfo,
961 struct recExtractCtrl *extctr,
963 const char **index_pp)
965 /* if right PI name, continue parsing PI */
966 if (0 == strcmp(zebra_pi_name, (const char *)node->name))
968 xmlChar *pi_p = node->content;
969 const char *look = (const char *) node->content;
971 /* parsing PI record instructions */
972 if (0 == strncmp((const char *)look, "record", 6))
983 if (attr_content_pi(&look, "id", id, sizeof(id)))
985 else if (attr_content_pi(&look, "rank", rank, sizeof(rank)))
987 else if (attr_content_pi(&look, "type", type, sizeof(type)))
989 dom_log(YLOG_WARN, tinfo, node,
990 "content '%s', can not parse '%s'",
994 set_record_info(tinfo, extctr, node, id, rank, type);
996 /* parsing index instruction */
997 else if (0 == strncmp((const char *)look, "index", 5))
1001 /* eat whitespace */
1002 while (*look && ' ' == *look)
1005 /* export index instructions to outside */
1010 dom_log(YLOG_WARN, tinfo, node,
1011 "content '%s', can not parse '%s'",
1017 /* Recursively walks the tree below node.  The node itself is first
   offered to process_xml_element_zebra_node().  Then every child is
   visited: processing instructions go to process_xml_pi_node(), which
   may leave an index instruction in index_p; element children are
   indexed with index_value_of() using that instruction and are then
   processed recursively.
   NOTE(review): the check on index_p before index_value_of() and any
   reset of index_p afterwards are not visible here. */
1018 static void process_xml_element_node(struct filter_info *tinfo,
1019 struct recExtractCtrl *extctr,
1023 /* remember indexing instruction from PI to next element node */
1024 const char *index_p = 0;
1026 /* check if we are an element node in the special zebra namespace
1027 and either set record data or index value-of node content*/
1028 process_xml_element_zebra_node(tinfo, extctr, recword, node);
1030 /* loop through kid nodes */
1031 for (node = node->children; node; node = node->next)
1033 /* check and set PI record and index instructions */
1034 if (node->type == XML_PI_NODE)
1036 process_xml_pi_node(tinfo, extctr, node, &index_p);
1038 else if (node->type == XML_ELEMENT_NODE)
1040 /* if there was a PI index instruction before this element */
1043 index_value_of(tinfo, extctr, recword, node, index_p);
1046 process_xml_element_node(tinfo, extctr, recword,node);
1054 /* Entry point for indexing one document: initialises a single recword
   through extctr->init and walks the whole document, starting at the
   document node itself, with process_xml_element_node().
   NOTE(review): the declarations of the doc parameter and of recword are
   not visible here. */
1055 static void extract_dom_doc_node(struct filter_info *tinfo,
1056 struct recExtractCtrl *extctr,
1059 /* only need to do the initialization once, reuse recword for all terms */
1061 (*extctr->init)(extctr, &recword);
1063 process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
/* Processes one input document in three stages:
   1. input conversion  - the input's own chain is applied to doc;
   2. store conversion  - a copy of doc is run through the store chain
      and the serialised result (or doc itself when there is no store
      document) is handed to p->setStoreData();
   3. extract conversion - the extract chain is applied to doc and the
      result is indexed with extract_dom_doc_node().
   Returns RECCTRL_EXTRACT_SKIP for an empty document or when no record
   instruction was seen during indexing, RECCTRL_EXTRACT_OK otherwise.
   NOTE(review): the checks for absent store/extract configuration, the
   choice between xsltSaveResultToString() and xmlDocDumpMemory(), and
   the freeing of doc and buf_out are not visible here. */
1069 static int convert_extract_doc(struct filter_info *tinfo,
1070 struct filter_input *input,
1071 struct recExtractCtrl *p,
1077 const char *params[10];
1078 xsltStylesheetPtr last_xsp = 0;
1079 xmlDocPtr store_doc = 0;
1081 /* per default do not ingest record */
1082 tinfo->record_info_invoked = 0;
1084 /* exit if empty document given */
1086 return RECCTRL_EXTRACT_SKIP;
1088 /* we actually have a document which needs to be processed further */
1090 set_param_str(params, "schema", zebra_dom_ns, tinfo->odr_record);
1092 /* input conversion */
1093 perform_convert(tinfo, p, input->convert, params, &doc, 0);
1097 /* store conversion */
1098 store_doc = xmlCopyDoc(doc, 1);
1099 perform_convert(tinfo, p, tinfo->store->convert,
1100 params, &store_doc, &last_xsp);
1103 /* saving either store doc or original doc in case no store doc exists */
1105 xsltSaveResultToString(&buf_out, &len_out,
1106 store_doc ? store_doc : doc, last_xsp);
1108 xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1110 (*p->setStoreData)(p, buf_out, len_out);
1114 xmlFreeDoc(store_doc);
1116 /* extract conversion */
1117 perform_convert(tinfo, p, tinfo->extract->convert, params, &doc, 0);
1120 /* finally, do the indexing */
1122 extract_dom_doc_node(tinfo, p, doc);
1126 /* there was nothing to index, so there is no inserted/updated record */
1127 if (tinfo->record_info_invoked == 0)
1128 return RECCTRL_EXTRACT_SKIP;
1130 return RECCTRL_EXTRACT_OK;
/* Extracts the next record from an XML stream that holds many records.
   On the first record of a stream any previous text reader is freed and
   a new one is created over the record stream with xmlReaderForIO().
   The reader is then advanced node by node; the first element found at
   depth split_level is expanded, copied into a new document of its own
   and passed to convert_extract_doc().  The reader is kept in the input
   so the following call continues where this one stopped.  When the
   stream is exhausted the reader is freed and RECCTRL_EXTRACT_EOF is
   returned.
   NOTE(review): the loop condition on ret and the condition guarding the
   RECCTRL_EXTRACT_ERROR_GENERIC return after xmlTextReaderExpand() are
   not visible here. */
1133 static int extract_xml_split(struct filter_info *tinfo,
1134 struct filter_input *input,
1135 struct recExtractCtrl *p)
1139 if (p->first_record)
1141 if (input->u.xmlreader.reader)
1142 xmlFreeTextReader(input->u.xmlreader.reader);
1143 input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1144 p /* I/O handler */,
1151 if (!input->u.xmlreader.reader)
1152 return RECCTRL_EXTRACT_ERROR_GENERIC;
1154 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1157 int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1158 int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
1160 if (type == XML_READER_TYPE_ELEMENT &&
1161 input->u.xmlreader.split_level == depth)
1165 /* per default do not ingest record */
1166 tinfo->record_info_invoked = 0;
1168 ptr = xmlTextReaderExpand(input->u.xmlreader.reader);
1171 /* we have a new document */
/* the expanded subtree is copied into a separate document */
1173 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1174 xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1176 xmlDocSetRootElement(doc, ptr2);
1178 /* writing debug info out */
1179 if (p->flagShowRecords)
1181 xmlChar *buf_out = 0;
1183 xmlDocDumpMemory(doc, &buf_out, &len_out);
1184 yaz_log(YLOG_LOG, "%s: XMLREADER level: %i\n%.*s",
1185 tinfo->fname ? tinfo->fname : "(none)",
1186 depth, len_out, buf_out);
1190 return convert_extract_doc(tinfo, input, p, doc);
1194 xmlFreeTextReader(input->u.xmlreader.reader);
1195 input->u.xmlreader.reader = 0;
1196 return RECCTRL_EXTRACT_ERROR_GENERIC;
1199 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1201 xmlFreeTextReader(input->u.xmlreader.reader);
1202 input->u.xmlreader.reader = 0;
1203 return RECCTRL_EXTRACT_EOF;
/* Extracts a stream that holds exactly one XML record.  On the first
   call for a stream the whole input is parsed with xmlReadIO() and the
   document is passed to convert_extract_doc(); otherwise
   RECCTRL_EXTRACT_EOF is returned.
   NOTE(review): the parser options and the check that leads to
   RECCTRL_EXTRACT_ERROR_GENERIC are not visible here - presumably a null
   document. */
1206 static int extract_xml_full(struct filter_info *tinfo,
1207 struct filter_input *input,
1208 struct recExtractCtrl *p)
1210 if (p->first_record) /* only one record per stream */
1212 xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex,
1213 p /* I/O handler */,
1221 return RECCTRL_EXTRACT_ERROR_GENERIC;
1223 return convert_extract_doc(tinfo, input, p, doc);
1226 return RECCTRL_EXTRACT_EOF;
/* Extracts the next ISO2709 (MARC) record from the stream.
   The first five bytes are read as the record length; while the first
   byte is not a digit, a warning is logged and one more byte is read
   into the end of the five-byte window, so garbage between records is
   skipped.  The length must be at least 25; the rest of the record is
   then read, decoded with yaz_marc_read_iso2709(), converted to an XML
   tree with yaz_marc_write_xml() and passed to convert_extract_doc() as
   a new document.  End of input yields RECCTRL_EXTRACT_EOF; a short
   length, a short read or a decoding failure yields
   RECCTRL_EXTRACT_ERROR_GENERIC.
   NOTE(review): the declaration and size of buf are not visible here;
   confirm it can hold the largest length five digits can express.  The
   shifting of the window inside the for loop is not visible either. */
1229 static int extract_iso2709(struct filter_info *tinfo,
1230 struct filter_input *input,
1231 struct recExtractCtrl *p)
1237 if (p->stream->readf(p->stream, buf, 5) != 5)
1238 return RECCTRL_EXTRACT_EOF;
1239 while (*buf < '0' || *buf > '9')
1243 dom_log(YLOG_WARN, tinfo, 0,
1244 "MARC: Skipping bad byte %d (0x%02X)",
1245 *buf & 0xff, *buf & 0xff);
1246 for (i = 0; i<4; i++)
1249 if (p->stream->readf(p->stream, buf+4, 1) != 1)
1250 return RECCTRL_EXTRACT_EOF;
1252 record_length = atoi_n (buf, 5);
1253 if (record_length < 25)
1255 dom_log(YLOG_WARN, tinfo, 0,
1256 "MARC record length < 25, is %d", record_length);
1257 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* the five length bytes are already in buf; read the remainder */
1259 read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1260 if (read_bytes < record_length-5)
1262 dom_log(YLOG_WARN, tinfo, 0,
1263 "couldn't read whole MARC record");
1264 return RECCTRL_EXTRACT_ERROR_GENERIC;
1266 r = yaz_marc_read_iso2709(input->u.marc.handle, buf, record_length);
1267 if (r < record_length)
1269 dom_log (YLOG_WARN, tinfo, 0,
1270 "parsing of MARC record failed r=%d length=%d",
1272 return RECCTRL_EXTRACT_ERROR_GENERIC;
1278 yaz_marc_write_xml(input->u.marc.handle, &root_ptr, 0, 0, 0);
1279 rdoc = xmlNewDoc((const xmlChar*) "1.0");
1280 xmlDocSetRootElement(rdoc, root_ptr);
1281 return convert_extract_doc(tinfo, input, p, rdoc);
1283 return RECCTRL_EXTRACT_OK;
/* Extract callback of the filter.  Uses the first entry of
   tinfo->input_list, resets the per-record ODR memory and dispatches on
   the input type: XML reader inputs go to extract_xml_full() when
   split_level is 0 and to extract_xml_split() otherwise; MARC inputs go
   to extract_iso2709().  Anything else yields
   RECCTRL_EXTRACT_ERROR_GENERIC.
   NOTE(review): the check before the first error return (presumably a
   missing input) and the switch statement line are not visible here. */
1286 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1288 struct filter_info *tinfo = clientData;
1289 struct filter_input *input = tinfo->input_list;
1292 return RECCTRL_EXTRACT_ERROR_GENERIC;
1294 odr_reset(tinfo->odr_record);
1297 case DOM_INPUT_XMLREADER:
1298 if (input->u.xmlreader.split_level == 0)
1299 return extract_xml_full(tinfo, input, p);
1301 return extract_xml_split(tinfo, input, p);
1303 case DOM_INPUT_MARC:
1304 return extract_iso2709(tinfo, input, p);
1306 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* Read callback handed to libxml2 during retrieval.  context is the
   recRetrieveCtrl; up to len bytes are pulled from its stream with
   readf() into buffer and the result of readf() is returned unchanged. */
1309 static int ioread_ret(void *context, char *buffer, int len)
1311 struct recRetrieveCtrl *p = context;
1312 return p->stream->readf(p->stream, buffer, len);
/* Close callback passed to libxml2 together with ioread_ret().
   NOTE(review): the body is not visible here. */
1315 static int ioclose_ret(void *context)
/* Retrieve callback of the filter.
   The element set name is taken from the simple or the complex record
   composition of the request and used to select a retrieve entry with
   lookup_retrieve().  Stylesheet parameters (id, filename, rank, schema,
   score, size) are prepared in p->odr memory, the stored record is parsed
   with xmlReadIO() and run through the entry's conversion chain.  The
   result is serialised and copied into p->rec_buf as VAL_TEXT_XML or
   VAL_SUTRS.  Failures are reported through p->diagnostic.
   NOTE(review): the conditions guarding the diagnostics, the guards
   around the filename/schema/score parameters, the choice between
   xsltSaveResultToString() and xmlDocDumpMemory(), the freeing of
   buf_out and doc, and the return value are not visible here.  The
   XML branch tests p->input_format while the SUTRS branch tests
   p->output_format - confirm this asymmetry is intended. */
1320 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
1322 /* const char *esn = zebra_dom_ns; */
1323 const char *esn = 0;
1324 const char *params[32];
1325 struct filter_info *tinfo = clientData;
1327 struct filter_retrieve *retrieve;
1328 xsltStylesheetPtr last_xsp = 0;
1332 if (p->comp->which == Z_RecordComp_simple
1333 && p->comp->u.simple->which == Z_ElementSetNames_generic)
1335 esn = p->comp->u.simple->u.generic;
1337 else if (p->comp->which == Z_RecordComp_complex
1338 && p->comp->u.complex->generic->elementSpec
1339 && p->comp->u.complex->generic->elementSpec->which ==
1340 Z_ElementSpec_elementSetName)
1342 esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1345 retrieve = lookup_retrieve(tinfo, esn);
1349 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1354 set_param_int(params, "id", p->localno, p->odr);
1356 set_param_str(params, "filename", p->fname, p->odr);
1357 if (p->staticrank >= 0)
1358 set_param_int(params, "rank", p->staticrank, p->odr);
/* schema parameter: the requested name, else the entry's name, else its
   identifier, else the empty string */
1361 set_param_str(params, "schema", esn, p->odr);
1364 set_param_str(params, "schema", retrieve->name, p->odr);
1365 else if (retrieve->identifier)
1366 set_param_str(params, "schema", retrieve->identifier, p->odr);
1368 set_param_str(params, "schema", "", p->odr);
1371 set_param_int(params, "score", p->score, p->odr);
1372 set_param_int(params, "size", p->recordSize, p->odr);
1374 doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1377 XML_PARSE_XINCLUDE | XML_PARSE_NOENT | XML_PARSE_NONET);
1380 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1384 /* retrieve conversion */
1385 perform_convert(tinfo, 0, retrieve->convert, params, &doc, &last_xsp);
1388 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1390 else if (p->input_format == VAL_NONE || p->input_format == VAL_TEXT_XML)
1396 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1398 xmlDocDumpMemory(doc, &buf_out, &len_out);
1400 p->output_format = VAL_TEXT_XML;
1401 p->rec_len = len_out;
1402 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1403 memcpy(p->rec_buf, buf_out, p->rec_len);
1406 else if (p->output_format == VAL_SUTRS)
1412 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1414 xmlDocDumpMemory(doc, &buf_out, &len_out);
1416 p->output_format = VAL_SUTRS;
1417 p->rec_len = len_out;
1418 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1419 memcpy(p->rec_buf, buf_out, p->rec_len);
1425 p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
/* Record type descriptor of this filter.
   NOTE(review): the initialiser is not visible here; presumably it lists
   the filter name and the init/config/destroy/extract/retrieve callbacks
   defined in this file. */
1431 static struct recType filter_type = {
1442 #ifdef IDZEBRA_STATIC_DOM
1455 * indent-tabs-mode: nil
1457 * vim: shiftwidth=4 tabstop=8 expandtab