1 /* This file is part of the Zebra server.
2 Copyright (C) 1995-2008 Index Data
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
25 #include <yaz/diagbib1.h>
26 #include <yaz/tpath.h>
27 #include <yaz/snprintf.h>
29 #include <libxml/xmlversion.h>
30 #include <libxml/parser.h>
31 #include <libxml/tree.h>
32 #include <libxml/xmlIO.h>
33 #include <libxml/xmlreader.h>
34 #include <libxslt/transform.h>
35 #include <libxslt/xsltutils.h>
38 #include <libexslt/exslt.h>
41 #include <idzebra/util.h>
42 #include <idzebra/recctrl.h>
43 #include <yaz/oid_db.h>
45 /* DOM filter style indexing */
46 #define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
47 static const char *zebra_dom_ns = ZEBRA_DOM_NS;
49 /* DOM filter style indexing */
50 #define ZEBRA_PI_NAME "zebra-2.0"
51 static const char *zebra_pi_name = ZEBRA_PI_NAME;
56 const char *stylesheet;
57 xsltStylesheetPtr stylesheet_xsp;
58 struct convert_s *next;
61 struct filter_extract {
63 struct convert_s *convert;
67 struct convert_s *convert;
70 struct filter_retrieve {
72 const char *identifier;
73 struct convert_s *convert;
74 struct filter_retrieve *next;
77 #define DOM_INPUT_XMLREADER 1
78 #define DOM_INPUT_MARC 2
82 struct convert_s *convert;
86 xmlTextReaderPtr reader;
90 const char *input_charset;
95 struct filter_input *next;
101 const char *profile_path;
104 xmlDocPtr doc_config;
105 struct filter_extract *extract;
106 struct filter_retrieve *retrieve_list;
107 struct filter_input *input_list;
108 struct filter_store *store;
109 int record_info_invoked;
/* Compare an xmlChar string `a` with a plain C string `b` using strcmp;
   `a` is cast to char*.  Arguments are not parenthesized in the expansion. */
114 #define XML_STRCMP(a,b) strcmp((char*)a, b)
/* Length of an xmlChar string via strlen, after a cast to char*. */
115 #define XML_STRLEN(a) strlen((char*)a)
/* Loop header that advances `ptr` along its ->next chain and runs the
   statement that follows the macro only for nodes of type XML_ELEMENT_NODE.
   `ptr` itself is modified (there is no loop-local copy), and because the
   expansion ends in a bare `if`, the statement after the macro becomes that
   if's body. */
118 #define FOR_EACH_ELEMENT(ptr) for (; ptr; ptr = ptr->next) if (ptr->type == XML_ELEMENT_NODE)
/* dom_log: printf-style logging helper for this filter.
   level - YAZ log level passed through to yaz_log
   tinfo - filter state; tinfo->fname is used as the message prefix
   ptr   - XML node the message refers to (callers also pass 0)
   fmt   - printf format followed by its arguments */
120 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
121 const char *fmt, ...)
/* lets the compiler check fmt (argument 4) against the variadic
   arguments (starting at argument 5) */
123 __attribute__ ((format (printf, 4, 5)))
127 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
128 const char *fmt, ...)
/* the message is first formatted into the local buffer `buf` */
134 yaz_vsnprintf(buf, sizeof(buf)-1, fmt, ap);
/* form with a line number taken from the node via xmlGetLineNo;
   "none" stands in when no file name has been recorded */
137 yaz_log(level, "%s:%ld: %s", tinfo->fname ? tinfo->fname : "none",
138 xmlGetLineNo(ptr), buf);
/* form without a line number
   NOTE(review): presumably chosen when ptr is 0 - confirm */
142 yaz_log(level, "%s: %s", tinfo->fname ? tinfo->fname : "none", buf);
/* set_param_str: build a single-quoted copy of `value` for use as a
   string parameter (name/value pair) in `params`.
   The copy is allocated from `nmem`. */
148 static void set_param_str(const char **params, const char *name,
149 const char *value, NMEM nmem)
/* strlen(value) + 3: room for the two quote characters and the NUL */
151 char *quoted = nmem_malloc(nmem, 3 + strlen(value));
/* NOTE(review): value is inserted verbatim; a value that itself contains
   a single quote is not escaped here */
152 sprintf(quoted, "'%s'", value);
/* set_param_int: like set_param_str, but for a zint value; the number is
   formatted with ZINT_FORMAT and wrapped in single quotes.
   The text is allocated from `nmem`. */
160 static void set_param_int(const char **params, const char *name,
161 zint value, NMEM nmem)
/* fixed 30-byte buffer: digits plus two quotes and the NUL */
163 char *quoted = nmem_malloc(nmem, 30); /* 25 digits enough for 2^64 */
166 sprintf(quoted, "'" ZINT_FORMAT "'", value);
/* filter_init: allocate a filter_info and put it into its initial state.
   Neither `res` nor `recType` is used in the lines shown here. */
172 static void *filter_init(Res res, RecType recType)
174 struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
176 tinfo->full_name = 0;
177 tinfo->profile_path = 0;
/* two separate memory pools: one reset per record (see filter_extract),
   one reset when the configuration is dropped (see destroy_dom) */
178 tinfo->nmem_record = nmem_create();
179 tinfo->nmem_config = nmem_create();
181 tinfo->retrieve_list = 0;
182 tinfo->input_list = 0;
184 tinfo->doc_config = 0;
185 tinfo->record_info_invoked = 0;
/* attr_content: test whether `attr` is named `name` and has a text node
   as its first child; if so, hand out its text through *dst_content.
   Callers use the result as a boolean in `if` conditions.
   The pointer stored in *dst_content refers to the node's own content
   buffer; nothing is copied. */
194 static int attr_content(struct _xmlAttr *attr, const char *name,
195 const char **dst_content)
197 if (!XML_STRCMP(attr->name, name) && attr->children
198 && attr->children->type == XML_TEXT_NODE)
200 *dst_content = (const char *)(attr->children->content);
/* destroy_xsp: free the compiled XSLT stylesheet held by a convert_s
   entry, if there is one.
   NOTE(review): presumably applied to every entry of the list starting at
   `c` - the iteration is not visible here, confirm. */
206 static void destroy_xsp(struct convert_s *c)
210 if (c->stylesheet_xsp)
211 xsltFreeStylesheet(c->stylesheet_xsp);
/* destroy_dom: release everything that was built from the configuration
   document: compiled stylesheets, per-input reader/MARC handles, the
   parsed config document and finally the config memory pool. */
216 static void destroy_dom(struct filter_info *tinfo)
/* stylesheets of the <extract> conversion chain */
220 destroy_xsp(tinfo->extract->convert);
/* stylesheets of the <store> conversion chain */
225 destroy_xsp(tinfo->store->convert);
/* per-input resources, depending on the input type */
228 if (tinfo->input_list)
230 struct filter_input *i_ptr;
231 for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
235 case DOM_INPUT_XMLREADER:
236 if (i_ptr->u.xmlreader.reader)
237 xmlFreeTextReader(i_ptr->u.xmlreader.reader);
/* MARC input: close the character set converter and the MARC handle */
240 yaz_iconv_close(i_ptr->u.marc.iconv);
241 yaz_marc_destroy(i_ptr->u.marc.handle);
244 destroy_xsp(i_ptr->convert);
/* list nodes come from nmem_config (see new_input), so only the head
   pointer is cleared here */
246 tinfo->input_list = 0;
248 if (tinfo->retrieve_list)
250 struct filter_retrieve *r_ptr;
251 for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
252 destroy_xsp(r_ptr->convert);
253 tinfo->retrieve_list = 0;
/* the config document is freed last among the XML objects */
256 if (tinfo->doc_config)
258 xmlFreeDoc(tinfo->doc_config);
259 tinfo->doc_config = 0;
/* drops all nmem_config allocations in one go */
261 nmem_reset(tinfo->nmem_config);
/* parse_convert: read a sequence of <xslt stylesheet="..."/> elements
   starting at `ptr` and build the corresponding convert_s entries under
   `*l`.  Each stylesheet is located via the profile path and compiled
   with libxslt.  Problems are reported through dom_log at YLOG_WARN. */
264 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
265 struct convert_s **l)
/* non-element siblings (text, comments) are skipped by the macro */
268 FOR_EACH_ELEMENT(ptr) {
269 if (!XML_STRCMP(ptr->name, "xslt"))
271 struct _xmlAttr *attr;
/* the entry lives in the config pool */
273 = nmem_malloc(tinfo->nmem_config, sizeof(*p));
277 p->stylesheet_xsp = 0;
/* only @stylesheet is recognised; anything else is reported */
279 for (attr = ptr->properties; attr; attr = attr->next)
280 if (attr_content(attr, "stylesheet", &p->stylesheet))
284 dom_log(YLOG_WARN, tinfo, ptr,
285 "bad attribute @%s", attr->name);
/* scratch buffer for the resolved stylesheet path */
289 char tmp_xslt_full_name[1024];
290 if (!yaz_filepath_resolve(p->stylesheet,
295 dom_log(YLOG_WARN, tinfo, 0,
296 "stylesheet %s not found in "
299 tinfo->profile_path);
/* compile the stylesheet; a null result means it could not be parsed */
304 = xsltParseStylesheetFile((const xmlChar*)
306 if (!p->stylesheet_xsp)
308 dom_log(YLOG_WARN, tinfo, 0,
309 "could not parse xslt stylesheet %s",
316 dom_log(YLOG_WARN, tinfo, ptr,
317 "missing attribute 'stylesheet' ");
/* any element other than <xslt> is reported */
325 dom_log(YLOG_WARN, tinfo, ptr,
326 "bad element '%s', expected <xslt>", ptr->name);
/* perform_convert: run the document through every stylesheet of the
   `convert` chain in order.
   extctr   - extract control; may be null (filter_retrieve passes 0),
              only consulted for flagShowRecords
   last_xsp - when used, receives the stylesheet applied last */
333 static ZEBRA_RES perform_convert(struct filter_info *tinfo,
334 struct recExtractCtrl *extctr,
335 struct convert_s *convert,
338 xsltStylesheetPtr *last_xsp)
340 for (; convert; convert = convert->next)
342 xmlChar *buf_out = 0;
344 xmlDocPtr res_doc = xsltApplyStylesheet(convert->stylesheet_xsp,
347 *last_xsp = convert->stylesheet_xsp;
352 /* now saving into buffer and re-reading into DOM to avoid annoying
353 XSLT problem with thrown-out indentation text nodes */
354 xsltSaveResultToString(&buf_out, &len_out, res_doc,
355 convert->stylesheet_xsp);
/* the serialized result is re-parsed and becomes the new *doc */
360 *doc = xmlParseMemory((const char *) buf_out, len_out);
362 /* writing debug info out */
363 if (extctr && extctr->flagShowRecords)
364 yaz_log(YLOG_LOG, "%s: XSLT %s\n %.*s",
365 tinfo->fname ? tinfo->fname : "(none)",
/* new_input: append a new filter_input node to the end of
   tinfo->input_list.  The node is allocated from the config pool. */
374 static struct filter_input *new_input(struct filter_info *tinfo, int type)
376 struct filter_input *p;
/* np walks the chain of `next` links until it points at the null tail */
377 struct filter_input **np = &tinfo->input_list;
378 for (;*np; np = &(*np)->next)
/* link the fresh node in at the tail */
380 p = *np = nmem_malloc(tinfo->nmem_config, sizeof(*p));
/* parse_input: handle the children of an <input> configuration element.
   Recognised children are <marc> and <xmlreader>; each creates one
   filter_input entry and parses its nested conversion chain. */
389 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
390 const char *syntax, const char *name)
392 FOR_EACH_ELEMENT(ptr) {
393 if (!XML_STRCMP(ptr->name, "marc"))
395 yaz_iconv_t iconv = 0;
/* default source character set when @inputcharset is absent */
396 const char *input_charset = "marc-8";
397 struct _xmlAttr *attr;
399 for (attr = ptr->properties; attr; attr = attr->next)
401 if (attr_content(attr, "inputcharset", &input_charset))
405 dom_log(YLOG_WARN, tinfo, ptr,
406 "bad attribute @%s, expected @inputcharset",
/* records are converted from the input charset to UTF-8 */
410 iconv = yaz_iconv_open("utf-8", input_charset);
413 dom_log(YLOG_WARN, tinfo, ptr,
414 "unsupported @charset '%s'", input_charset);
/* MARC input entry: owns a MARC handle wired to the converter */
419 struct filter_input *p
420 = new_input(tinfo, DOM_INPUT_MARC);
421 p->u.marc.handle = yaz_marc_create();
422 p->u.marc.iconv = iconv;
424 yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
428 parse_convert(tinfo, ptr, &p->convert);
433 else if (!XML_STRCMP(ptr->name, "xmlreader"))
435 struct filter_input *p
436 = new_input(tinfo, DOM_INPUT_XMLREADER);
437 struct _xmlAttr *attr;
438 const char *level_str = 0;
/* defaults: split level 0, reader created lazily (see extract_xml_split) */
440 p->u.xmlreader.split_level = 0;
441 p->u.xmlreader.reader = 0;
443 for (attr = ptr->properties; attr; attr = attr->next)
445 if (attr_content(attr, "level", &level_str))
449 dom_log(YLOG_WARN, tinfo, ptr,
450 "bad attribute @%s, expected @level",
/* atoi: a non-numeric @level yields 0 */
455 p->u.xmlreader.split_level = atoi(level_str);
459 parse_convert(tinfo, ptr, &p->convert);
464 dom_log(YLOG_WARN, tinfo, ptr,
465 "bad element <%s>, expected <marc>|<xmlreader>",
/* parse_dom: load and interpret the filter configuration file `fname`.
   The root element must be <dom>; its element children <extract>,
   <retrieve>, <store> and <input> populate the corresponding parts of
   `tinfo`.  Unknown elements and attributes are reported via dom_log. */
473 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
475 char tmp_full_name[1024];
479 tinfo->fname = nmem_strdup(tinfo->nmem_config, fname);
/* resolve against the profile path; fall back to the name as given */
481 if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path,
482 NULL, tmp_full_name))
483 tinfo->full_name = nmem_strdup(tinfo->nmem_config, tmp_full_name);
485 tinfo->full_name = nmem_strdup(tinfo->nmem_config, tinfo->fname);
487 yaz_log(YLOG_LOG, "%s dom filter: "
488 "loading config file %s", tinfo->fname, tinfo->full_name);
490 doc = xmlParseFile(tinfo->full_name);
493 yaz_log(YLOG_WARN, "%s: dom filter: "
494 "failed to parse config file %s",
495 tinfo->fname, tinfo->full_name);
498 /* save because we store ptrs to the content */
499 tinfo->doc_config = doc;
501 ptr = xmlDocGetRootElement(doc);
502 if (!ptr || ptr->type != XML_ELEMENT_NODE
503 || XML_STRCMP(ptr->name, "dom"))
505 dom_log(YLOG_WARN, tinfo, ptr,
506 "bad root element <%s>, expected root element <dom>",
512 FOR_EACH_ELEMENT(ptr) {
513 if (!XML_STRCMP(ptr->name, "extract"))
516 <extract name="index">
517 <xslt stylesheet="first.xsl"/>
518 <xslt stylesheet="second.xsl"/>
521 struct _xmlAttr *attr;
522 struct filter_extract *f =
523 nmem_malloc(tinfo->nmem_config, sizeof(*f));
/* only @name is accepted on <extract> */
528 for (attr = ptr->properties; attr; attr = attr->next)
530 if (attr_content(attr, "name", &f->name))
534 dom_log(YLOG_WARN, tinfo, ptr,
535 "bad attribute @%s, expected @name",
539 parse_convert(tinfo, ptr->children, &f->convert);
541 else if (!XML_STRCMP(ptr->name, "retrieve"))
545 <xslt stylesheet="some.xsl"/>
546 <xslt stylesheet="some.xsl"/>
549 struct _xmlAttr *attr;
550 struct filter_retrieve **fp = &tinfo->retrieve_list;
551 struct filter_retrieve *f =
552 nmem_malloc(tinfo->nmem_config, sizeof(*f));
/* <retrieve> accepts @identifier and @name */
563 for (attr = ptr->properties; attr; attr = attr->next)
565 if (attr_content(attr, "identifier",
568 else if (attr_content(attr, "name", &f->name))
572 dom_log(YLOG_WARN, tinfo, ptr,
573 "bad attribute @%s, expected @identifier|@name",
577 parse_convert(tinfo, ptr->children, &f->convert);
579 else if (!XML_STRCMP(ptr->name, "store"))
583 <xslt stylesheet="some.xsl"/>
584 <xslt stylesheet="some.xsl"/>
587 struct filter_store *f =
588 nmem_malloc(tinfo->nmem_config, sizeof(*f));
592 parse_convert(tinfo, ptr->children, &f->convert);
594 else if (!XML_STRCMP(ptr->name, "input"))
598 <xmlreader level="1"/>
600 <input syntax="usmarc">
601 <marc inputcharset="marc-8"/>
604 struct _xmlAttr *attr;
605 const char *syntax = 0;
606 const char *name = 0;
/* <input> accepts @syntax and @name; both are handed to parse_input */
607 for (attr = ptr->properties; attr; attr = attr->next)
609 if (attr_content(attr, "syntax", &syntax))
611 else if (attr_content(attr, "name", &name))
615 dom_log(YLOG_WARN, tinfo, ptr,
616 "bad attribute @%s, expected @syntax|@name",
620 parse_input(tinfo, ptr->children, syntax, name);
624 dom_log(YLOG_WARN, tinfo, ptr,
626 "expected <extract>|<input>|<retrieve>|<store>",
/* no <input> configured: install a default XML reader input with
   split level 0 */
631 if (!tinfo->input_list)
633 struct filter_input *p
634 = new_input(tinfo, DOM_INPUT_XMLREADER);
635 p->u.xmlreader.split_level = 0;
636 p->u.xmlreader.reader = 0;
/* lookup_retrieve: find the <retrieve> configuration entry matching the
   requested element set name `est`, comparing against both the entry's
   identifier and its name. */
641 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
644 struct filter_retrieve *f = tinfo->retrieve_list;
646 /* return first schema if no est is provided */
649 for (; f; f = f->next)
651 /* find requested schema */
/* either field may be unset, hence the null checks before strcmp */
654 if (f->identifier && !strcmp(f->identifier, est))
656 if (f->name && !strcmp(f->name, est))
/* filter_config: configure the filter from the config file named by
   `args`.  The profile path is read from the resource "profilePath" and
   the actual parsing is delegated to parse_dom. */
663 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
665 struct filter_info *tinfo = clientData;
668 yaz_log(YLOG_WARN, "dom filter: need config file");
/* same file name as the one already recorded in tinfo->fname
   NOTE(review): presumably short-circuits re-parsing - confirm */
672 if (tinfo->fname && !strcmp(args, tinfo->fname))
675 tinfo->profile_path = res_get(res, "profilePath");
678 return parse_dom(tinfo, args);
/* filter_destroy: tear down the filter instance, destroying both memory
   pools created in filter_init. */
681 static void filter_destroy(void *clientData)
683 struct filter_info *tinfo = clientData;
685 nmem_destroy(tinfo->nmem_config);
686 nmem_destroy(tinfo->nmem_record);
/* ioread_ex: read callback handed to libxml2 during extraction.
   `context` is the recExtractCtrl; the read is forwarded to its stream's
   readf function and that function's result is returned unchanged. */
690 static int ioread_ex(void *context, char *buffer, int len)
692 struct recExtractCtrl *p = context;
693 return p->stream->readf(p->stream, buffer, len);
/* ioclose_ex: close callback passed to libxml2 together with ioread_ex
   (see xmlReaderForIO / xmlReadIO calls in this file). */
696 static int ioclose_ex(void *context)
702 /* DOM filter style indexing */
/* attr_content_xml: same test as attr_content - attribute name equals
   `name` and the first child is a text node - storing a pointer to the
   node's own content in *dst_content.  Callers use the result as a
   boolean. */
703 static int attr_content_xml(struct _xmlAttr *attr, const char *name,
704 const char **dst_content)
706 if (0 == XML_STRCMP(attr->name, name) && attr->children
707 && attr->children->type == XML_TEXT_NODE)
709 *dst_content = (const char *) (attr->children->content);
716 /* DOM filter style indexing */
/* index_value_of: index the text content of a node under every index
   named in `index_p`.
   The scanning loops below treat `index_p` as a space-separated list of
   "name" or "name:type" items.  For each item the recword is filled in
   and passed to extctr->tokenAdd. */
717 static void index_value_of(struct filter_info *tinfo,
718 struct recExtractCtrl *extctr,
/* nothing is indexed unless exactly one record element was seen */
723 if (tinfo->record_info_invoked == 1)
/* NOTE(review): the result of xmlNodeGetContent goes straight into
   strlen; no null check is visible here - confirm */
725 xmlChar *text = xmlNodeGetContent(node);
726 size_t text_len = strlen((const char *)text);
728 /* if there is no text, we do not need to proceed */
731 const char *look = index_p;
738 /* assigning text to be indexed */
739 recword->term_buf = (const char *)text;
740 recword->term_len = text_len;
742 /* parsing all index name/type pairs */
743 /* may not start with ' ' or ':' */
744 while (*look && ' ' != *look && ':' != *look)
746 /* setting name and type to zero */
750 /* parsing one index name */
752 while (*look && ':' != *look && ' ' != *look)
/* copy [bval, eval) and terminate explicitly, since strncpy does not
   NOTE(review): the size of `index` is not visible here - confirm the
   copy is bounded */
757 strncpy((char *)index, (const char *)bval, eval - bval);
758 index[eval - bval] = '\0';
761 /* parsing one index type, if existing */
767 while (*look && ' ' != *look)
772 strncpy((char *)type, (const char *)bval, eval - bval);
773 type[eval - bval] = '\0';
776 /* actually indexing the text given */
778 recword->index_name = (const char *)index;
780 recword->index_type = (const char *) type;
782 /* writing debug out */
783 if (extctr->flagShowRecords)
784 dom_log(YLOG_LOG, tinfo, 0,
785 "INDEX '%s:%s' '%s'",
786 (const char *) index,
788 (const char *) text);
790 (extctr->tokenAdd)(recword);
792 /* eat whitespaces */
793 if (*look && ' ' == *look)
804 /* DOM filter style indexing */
/* set_record_info: transfer the record attributes (id, rank, type) found
   in the document into the extract control and count the invocation.
   id_p   - match criteria string
   rank_p - static rank, parsed with atozint
   type_p - one of "insert", "delete", "replace", "update" */
805 static void set_record_info(struct filter_info *tinfo,
806 struct recExtractCtrl *extctr,
812 /* writing debug info out */
813 if (extctr && extctr->flagShowRecords)
814 dom_log(YLOG_LOG, tinfo, node,
815 "RECORD id=%s rank=%s type=%s",
816 id_p ? (const char *) id_p : "(null)",
817 rank_p ? (const char *) rank_p : "(null)",
818 type_p ? (const char *) type_p : "(null)");
/* %255s: copies the first whitespace-delimited token, at most 255
   characters, into match_criteria */
822 sscanf((const char *)id_p, "%255s", extctr->match_criteria);
824 if (rank_p && *rank_p)
825 extctr->staticrank = atozint((const char *)rank_p);
827 if (type_p && *type_p)
/* action_update is the starting value before the string is examined */
829 enum zebra_recctrl_action_t action = action_update;
830 if (!strcmp(type_p, "insert"))
831 action = action_insert;
832 else if (!strcmp(type_p, "delete"))
833 action = action_delete;
834 else if (!strcmp(type_p, "replace"))
835 action = action_replace;
836 else if (!strcmp(type_p, "update"))
837 action = action_update;
839 dom_log(YLOG_WARN, tinfo, node, "bad @type value: %s", type_p);
840 extctr->action = action;
/* the counter is 1 only on the second call, so the warning is issued
   once no matter how many further record elements follow */
843 if (tinfo->record_info_invoked == 1)
845 /* warn about multiple only once */
846 dom_log(YLOG_WARN, tinfo, node, "multiple record elements");
848 tinfo->record_info_invoked++;
853 /* DOM filter style indexing */
/* process_xml_element_zebra_node: act on an element in the Zebra DOM
   namespace.  <index name="..."> indexes the element's content,
   <record id=".." rank=".." type=".."> sets the record information;
   other element names in that namespace are reported. */
854 static void process_xml_element_zebra_node(struct filter_info *tinfo,
855 struct recExtractCtrl *extctr,
/* only elements whose namespace href equals zebra_dom_ns are handled */
859 if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href
860 && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns))
862 if (0 == XML_STRCMP(node->name, "index"))
864 const char *index_p = 0;
866 struct _xmlAttr *attr;
867 for (attr = node->properties; attr; attr = attr->next)
869 if (attr_content_xml(attr, "name", &index_p))
871 index_value_of(tinfo, extctr, recword, node, index_p);
875 dom_log(YLOG_WARN, tinfo, node,
876 "bad attribute @%s, expected @name",
881 else if (0 == XML_STRCMP(node->name, "record"))
/* attributes not present stay null and are passed on as such */
883 const char *id_p = 0;
884 const char *rank_p = 0;
885 const char *type_p = 0;
887 struct _xmlAttr *attr;
888 for (attr = node->properties; attr; attr = attr->next)
890 if (attr_content_xml(attr, "id", &id_p))
892 else if (attr_content_xml(attr, "rank", &rank_p))
894 else if (attr_content_xml(attr, "type", &type_p))
898 dom_log(YLOG_WARN, tinfo, node,
899 "bad attribute @%s, expected @id|@rank|@type",
903 set_record_info(tinfo, extctr, node, id_p, rank_p, type_p);
907 dom_log(YLOG_WARN, tinfo, node,
909 " expected <record>|<index> in namespace '%s'",
910 node->name, zebra_dom_ns);
/* attr_content_pi: try to read one name=value item from the processing
   instruction text at *c_ptr.
   name      - attribute name expected at the current position
   value     - output buffer, value_max bytes
   Callers use the result as a boolean. */
915 static int attr_content_pi(const char **c_ptr, const char *name,
916 char *value, size_t value_max)
918 size_t name_len = strlen(name);
919 const char *look = *c_ptr;
/* skip leading blanks */
923 while (*look && ' ' == *look)
/* there must be more text than just the name */
925 if (strlen(look) > name_len)
/* the name must be followed directly by '=' */
927 if (look[name_len] == '=' && !memcmp(look, name, name_len))
/* scan up to the next blank or end of string */
931 while (*look && ' ' != *look)
/* skip trailing blanks */
941 while (*look && ' ' == *look)
947 /* DOM filter style indexing */
/* process_xml_pi_node: interpret a processing instruction whose target
   equals zebra_pi_name.  Content starting with "record" carries id, rank
   and type items and results in a set_record_info call; content starting
   with "index" carries index instructions.  Anything else is reported. */
948 static void process_xml_pi_node(struct filter_info *tinfo,
949 struct recExtractCtrl *extctr,
951 const char **index_pp)
953 /* if right PI name, continue parsing PI */
954 if (0 == strcmp(zebra_pi_name, (const char *)node->name))
956 xmlChar *pi_p = node->content;
957 const char *look = (const char *) node->content;
959 /* parsing PI record instructions */
960 if (0 == strncmp((const char *)look, "record", 6))
/* items are tried in turn at the current position */
971 if (attr_content_pi(&look, "id", id, sizeof(id)))
973 else if (attr_content_pi(&look, "rank", rank, sizeof(rank)))
975 else if (attr_content_pi(&look, "type", type, sizeof(type)))
977 dom_log(YLOG_WARN, tinfo, node,
978 "content '%s', can not parse '%s'",
982 set_record_info(tinfo, extctr, node, id, rank, type);
984 /* parsing index instruction */
985 else if (0 == strncmp((const char *)look, "index", 5))
990 while (*look && ' ' == *look)
993 /* export index instructions to outside */
998 dom_log(YLOG_WARN, tinfo, node,
999 "content '%s', can not parse '%s'",
1005 /* DOM filter style indexing */
/* process_xml_element_node: recursive walk over the document tree.
   The node itself is checked for Zebra-namespace instructions, then each
   child is visited: processing instructions are interpreted, element
   children are indexed and descended into. */
1006 static void process_xml_element_node(struct filter_info *tinfo,
1007 struct recExtractCtrl *extctr,
1011 /* remember indexing instruction from PI to next element node */
1012 const char *index_p = 0;
1014 /* check if we are an element node in the special zebra namespace
1015 and either set record data or index value-of node content*/
1016 process_xml_element_zebra_node(tinfo, extctr, recword, node);
1018 /* loop through kid nodes */
1019 for (node = node->children; node; node = node->next)
1021 /* check and set PI record and index instructions */
1022 if (node->type == XML_PI_NODE)
1024 process_xml_pi_node(tinfo, extctr, node, &index_p);
1026 else if (node->type == XML_ELEMENT_NODE)
1028 /* if there was a PI index instruction before this element */
1031 index_value_of(tinfo, extctr, recword, node, index_p);
/* recurse into the child element */
1034 process_xml_element_node(tinfo, extctr, recword,node);
1042 /* DOM filter style indexing */
/* extract_dom_doc_node: entry point for indexing a whole document.
   Initialises one recword through extctr->init and starts the recursive
   walk at the document node. */
1043 static void extract_dom_doc_node(struct filter_info *tinfo,
1044 struct recExtractCtrl *extctr,
1047 /* only need to do the initialization once, reuse recword for all terms */
1049 (*extctr->init)(extctr, &recword);
/* the xmlDocPtr is cast to xmlNodePtr so the walk starts at the document */
1051 process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
/* convert_extract_doc: run one parsed record through the configured
   pipeline: input conversion, store conversion (on a copy) with hand-over
   of the stored form via setStoreData, extract conversion, and finally
   indexing.
   Returns RECCTRL_EXTRACT_SKIP when no document is given or when the
   record information was never set during indexing, otherwise
   RECCTRL_EXTRACT_OK. */
1057 static int convert_extract_doc(struct filter_info *tinfo,
1058 struct filter_input *input,
1059 struct recExtractCtrl *p,
1065 const char *params[10];
1066 xsltStylesheetPtr last_xsp = 0;
1067 xmlDocPtr store_doc = 0;
1069 /* per default do not ingest record */
1070 tinfo->record_info_invoked = 0;
1072 /* exit if empty document given */
1074 return RECCTRL_EXTRACT_SKIP;
1076 /* we actually have a document which needs to be processed further */
/* the "schema" stylesheet parameter is the Zebra DOM namespace */
1078 set_param_str(params, "schema", zebra_dom_ns, tinfo->nmem_record);
1080 if (p && p->flagShowRecords)
/* debug dump goes to a file named extract.xml (relative path)
   NOTE(review): no check of the fopen result is visible before fwrite -
   confirm */
1085 FILE *outf = fopen("extract.xml", "w");
1086 xmlDocDumpMemory(doc, &buf_out, &len_out);
1087 fwrite(buf_out, 1, len_out, outf);
1089 yaz_log(YLOG_LOG, "Extract Doc: %.*s", len_out, buf_out);
1095 /* input conversion */
1096 perform_convert(tinfo, p, input->convert, params, &doc, 0);
1101 /* store conversion */
/* works on a recursive copy so `doc` stays available for extraction */
1102 store_doc = xmlCopyDoc(doc, 1);
1103 perform_convert(tinfo, p, tinfo->store->convert,
1104 params, &store_doc, &last_xsp);
1107 /* saving either store doc or original doc in case no store doc exists */
1109 xsltSaveResultToString(&buf_out, &len_out,
1110 store_doc ? store_doc : doc, last_xsp);
1112 xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1114 if (p->setStoreData)
1115 (*p->setStoreData)(p, buf_out, len_out);
1119 xmlFreeDoc(store_doc);
1121 /* extract conversion */
1122 perform_convert(tinfo, p, tinfo->extract->convert, params, &doc, 0);
1125 /* finally, do the indexing */
1127 extract_dom_doc_node(tinfo, p, doc);
1131 /* there was nothing to index, so there is no inserted/updated record */
1132 if (tinfo->record_info_invoked == 0)
1133 return RECCTRL_EXTRACT_SKIP;
1135 return RECCTRL_EXTRACT_OK;
/* extract_xml_split: streaming extraction.  The input is read with an
   xmlTextReader; every element found at depth `split_level` is expanded,
   copied into a document of its own and processed as one record.
   The reader is kept in `input` between calls and freed on error and at
   end of input. */
1138 static int extract_xml_split(struct filter_info *tinfo,
1139 struct filter_input *input,
1140 struct recExtractCtrl *p)
/* first record of a stream: drop any previous reader, open a new one */
1144 if (p->first_record)
1146 if (input->u.xmlreader.reader)
1147 xmlFreeTextReader(input->u.xmlreader.reader);
1148 input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1149 p /* I/O handler */,
1156 if (!input->u.xmlreader.reader)
1157 return RECCTRL_EXTRACT_ERROR_GENERIC;
1159 ret = xmlTextReaderRead(input->u.xmlreader.reader);
1162 int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1163 int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
/* a record starts at an element whose depth equals the split level */
1165 if (type == XML_READER_TYPE_ELEMENT &&
1166 input->u.xmlreader.split_level == depth)
1170 /* per default do not ingest record */
1171 tinfo->record_info_invoked = 0;
1173 ptr = xmlTextReaderExpand(input->u.xmlreader.reader);
1176 /* we have a new document */
/* recursive copy of the expanded subtree becomes the root of the new doc */
1178 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1179 xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1181 xmlDocSetRootElement(doc, ptr2);
1183 /* writing debug info out */
1184 if (p->flagShowRecords)
1186 xmlChar *buf_out = 0;
1188 xmlDocDumpMemory(doc, &buf_out, &len_out);
1189 yaz_log(YLOG_LOG, "%s: XMLREADER level: %i\n%.*s",
1190 tinfo->fname ? tinfo->fname : "(none)",
1191 depth, len_out, buf_out);
1195 return convert_extract_doc(tinfo, input, p, doc);
/* error path: the reader is released and a generic error returned */
1199 xmlFreeTextReader(input->u.xmlreader.reader);
1200 input->u.xmlreader.reader = 0;
1201 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* advance to the next node */
1204 ret = xmlTextReaderRead(input->u.xmlreader.reader);
/* end of input: release the reader and report EOF */
1206 xmlFreeTextReader(input->u.xmlreader.reader);
1207 input->u.xmlreader.reader = 0;
1208 return RECCTRL_EXTRACT_EOF;
/* extract_xml_full: non-streaming extraction.  The whole stream is
   parsed into one document with xmlReadIO and processed as a single
   record; any call that is not for the first record returns EOF. */
1211 static int extract_xml_full(struct filter_info *tinfo,
1212 struct filter_input *input,
1213 struct recExtractCtrl *p)
1215 if (p->first_record) /* only one record per stream */
1217 xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex,
1218 p /* I/O handler */,
1226 return RECCTRL_EXTRACT_ERROR_GENERIC;
1228 return convert_extract_doc(tinfo, input, p, doc);
1231 return RECCTRL_EXTRACT_EOF;
/* extract_iso2709: read one ISO 2709 (MARC) record from the stream,
   decode it with the input's MARC handle, turn it into an XML document
   and process that document as a record. */
1234 static int extract_iso2709(struct filter_info *tinfo,
1235 struct filter_input *input,
1236 struct recExtractCtrl *p)
/* the first five bytes hold the record length */
1242 if (p->stream->readf(p->stream, buf, 5) != 5)
1243 return RECCTRL_EXTRACT_EOF;
/* resynchronise: while the first byte is not a digit, report it, shift
   the buffer and read one more byte */
1244 while (*buf < '0' || *buf > '9')
1248 dom_log(YLOG_WARN, tinfo, 0,
1249 "MARC: Skipping bad byte %d (0x%02X)",
1250 *buf & 0xff, *buf & 0xff);
1251 for (i = 0; i<4; i++)
1254 if (p->stream->readf(p->stream, buf+4, 1) != 1)
1255 return RECCTRL_EXTRACT_EOF;
/* record length is the number formed by the first five characters */
1257 record_length = atoi_n (buf, 5);
1258 if (record_length < 25)
1260 dom_log(YLOG_WARN, tinfo, 0,
1261 "MARC record length < 25, is %d", record_length);
1262 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* read the remainder of the record after the 5-byte length field */
1264 read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1265 if (read_bytes < record_length-5)
1267 dom_log(YLOG_WARN, tinfo, 0,
1268 "couldn't read whole MARC record");
1269 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* a result smaller than record_length is treated as a parse failure */
1271 r = yaz_marc_read_iso2709(input->u.marc.handle, buf, record_length);
1272 if (r < record_length)
1274 dom_log (YLOG_WARN, tinfo, 0,
1275 "parsing of MARC record failed r=%d length=%d",
1277 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* emit the decoded record as XML in the MARC21 slim namespace and wrap
   it in a fresh document */
1283 yaz_marc_write_xml(input->u.marc.handle, &root_ptr,
1284 "http://www.loc.gov/MARC21/slim", 0, 0);
1285 rdoc = xmlNewDoc((const xmlChar*) "1.0");
1286 xmlDocSetRootElement(rdoc, root_ptr);
1287 return convert_extract_doc(tinfo, input, p, rdoc);
1289 return RECCTRL_EXTRACT_OK;
/* filter_extract: extraction entry point.  Resets the per-record memory
   pool and dispatches to the reader matching the first configured input:
   full-document XML, split XML, or ISO 2709. */
1292 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1294 struct filter_info *tinfo = clientData;
/* only the head of the input list is used */
1295 struct filter_input *input = tinfo->input_list;
1298 return RECCTRL_EXTRACT_ERROR_GENERIC;
1300 nmem_reset(tinfo->nmem_record);
/* without a setStoreData callback the input is always read as one full
   XML document */
1302 if (p->setStoreData == 0)
1303 return extract_xml_full(tinfo, input, p);
1306 case DOM_INPUT_XMLREADER:
/* split level 0 means: treat the whole stream as a single record */
1307 if (input->u.xmlreader.split_level == 0)
1308 return extract_xml_full(tinfo, input, p);
1310 return extract_xml_split(tinfo, input, p);
1312 case DOM_INPUT_MARC:
1313 return extract_iso2709(tinfo, input, p);
1315 return RECCTRL_EXTRACT_ERROR_GENERIC;
/* ioread_ret: read callback handed to libxml2 during retrieval.
   `context` is the recRetrieveCtrl; the read is forwarded to its stream's
   readf function and that function's result is returned unchanged. */
1318 static int ioread_ret(void *context, char *buffer, int len)
1320 struct recRetrieveCtrl *p = context;
1321 return p->stream->readf(p->stream, buffer, len);
/* ioclose_ret: close callback passed to xmlReadIO together with
   ioread_ret in filter_retrieve. */
1324 static int ioclose_ret(void *context)
/* filter_retrieve: retrieval entry point.  Determines the requested
   element set name, looks up the matching <retrieve> configuration,
   parses the stored record, applies the retrieve conversion chain with a
   set of stylesheet parameters and returns the result as XML or SUTRS.
   Failures are reported through p->diagnostic. */
1329 static int filter_retrieve (void *clientData, struct recRetrieveCtrl *p)
1331 /* const char *esn = zebra_dom_ns; */
1332 const char *esn = 0;
1333 const char *params[32];
1334 struct filter_info *tinfo = clientData;
1336 struct filter_retrieve *retrieve;
1337 xsltStylesheetPtr last_xsp = 0;
/* element set name from a simple record composition ... */
1341 if (p->comp->which == Z_RecordComp_simple
1342 && p->comp->u.simple->which == Z_ElementSetNames_generic)
1344 esn = p->comp->u.simple->u.generic;
/* ... or from a complex one carrying an element set name */
1346 else if (p->comp->which == Z_RecordComp_complex
1347 && p->comp->u.complex->generic->elementSpec
1348 && p->comp->u.complex->generic->elementSpec->which ==
1349 Z_ElementSpec_elementSetName)
1351 esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1354 retrieve = lookup_retrieve(tinfo, esn);
1358 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1359 p->addinfo = odr_strdup(p->odr, esn);
/* stylesheet parameters: id, filename, rank, schema, score, size;
   all parameter strings are allocated from the ODR memory */
1364 set_param_int(params, "id", p->localno, p->odr->mem);
1366 set_param_str(params, "filename", p->fname, p->odr->mem);
1367 if (p->staticrank >= 0)
1368 set_param_int(params, "rank", p->staticrank, p->odr->mem);
/* "schema": candidates are the requested name, the configured name, the
   configured identifier, or the empty string */
1371 set_param_str(params, "schema", esn, p->odr->mem);
1374 set_param_str(params, "schema", retrieve->name, p->odr->mem);
1375 else if (retrieve->identifier)
1376 set_param_str(params, "schema", retrieve->identifier, p->odr->mem);
1378 set_param_str(params, "schema", "", p->odr->mem);
1381 set_param_int(params, "score", p->score, p->odr->mem);
1382 set_param_int(params, "size", p->recordSize, p->odr->mem);
/* parse the stored record from the retrieval stream */
1384 doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1387 XML_PARSE_XINCLUDE | XML_PARSE_NOENT | XML_PARSE_NONET);
1390 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1394 /* retrieve conversion */
1395 perform_convert(tinfo, 0, retrieve->convert, params, &doc, &last_xsp);
1398 p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
/* XML output: chosen when no input format is given or it is XML */
1400 else if (!p->input_format
1401 || !oid_oidcmp(p->input_format, yaz_oid_recsyn_xml))
1407 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1409 xmlDocDumpMemory(doc, &buf_out, &len_out);
1411 p->output_format = yaz_oid_recsyn_xml;
1412 p->rec_len = len_out;
/* the result is copied into ODR-owned memory */
1413 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1414 memcpy(p->rec_buf, buf_out, p->rec_len);
/* NOTE(review): this branch tests p->output_format while the branch
   above tests p->input_format; looks like input_format may have been
   intended - confirm */
1417 else if (!oid_oidcmp(p->output_format, yaz_oid_recsyn_sutrs))
1423 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1425 xmlDocDumpMemory(doc, &buf_out, &len_out);
1427 p->output_format = yaz_oid_recsyn_sutrs;
1428 p->rec_len = len_out;
1429 p->rec_buf = odr_malloc(p->odr, p->rec_len);
1430 memcpy(p->rec_buf, buf_out, p->rec_len);
/* any other requested syntax is rejected */
1436 p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
1442 static struct recType filter_type = {
1453 #ifdef IDZEBRA_STATIC_DOM
1466 * indent-tabs-mode: nil
1468 * vim: shiftwidth=4 tabstop=8 expandtab