1 /* This file is part of the Zebra server.
2 Copyright (C) 1994-2010 Index Data
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 \brief indexes records and extract tokens for indexing and sorting
40 #include <yaz/snprintf.h>
/* Log levels used in this file. They start out as 0 and are filled in by
   zebra_init_log_level(); log_level_initialized records that the lookup
   has already been done. */
42 static int log_level_extract = 0;
43 static int log_level_details = 0;
44 static int log_level_initialized = 0;
46 /* 1 if we eliminate identical delete/insert keys */
47 /* eventually the 0-case code will be removed */
/* Forward declaration: extract_flush_record_keys2() is defined further
   down in this file but is called before that point. */
50 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
51 zebra_rec_keys_t ins_keys,
53 zebra_rec_keys_t del_keys,
/* Look up the YAZ log levels for the "extract" and "indexdetails" log
   modules. log_level_initialized is tested first and then set to 1, so
   the lookup is meant to happen only on the first call. */
56 static void zebra_init_log_level(void)
58 if (!log_level_initialized)
60 log_level_initialized = 1;
62 log_level_extract = yaz_log_module_level("extract");
63 log_level_details = yaz_log_module_level("indexdetails");
/* Build a WRBUF holding a printable rendering of the C string cstr.
   Bytes below ' ' or above 126 are written as a backslash followed by two
   uppercase hex digits; wrbuf_putc() appends a byte unchanged.
   NOTE(review): the WRBUF comes from wrbuf_alloc(), so presumably the
   caller has to release it - confirm against the callers. */
67 static WRBUF wrbuf_hex_str(const char *cstr)
70 WRBUF w = wrbuf_alloc();
71 for (i = 0; cstr[i]; i++)
/* where plain char is signed, bytes >= 0x80 are negative and therefore
   also satisfy the "< ' '" test */
73 if (cstr[i] < ' ' || cstr[i] > 126)
/* & 0xff keeps a negative char from printing as a sign-extended value */
74 wrbuf_printf(w, "\\%02X", cstr[i] & 0xff);
76 wrbuf_putc(w, cstr[i]);
/* Forward declarations of static helpers that are referenced in this file
   before their definitions. */
82 static void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
83 int cmd, zebra_rec_keys_t skp);
84 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid);
85 static void extract_token_add(RecWord *p);
/* Log a notice when the number of records handled so far (processed plus
   skipped) is exactly zh->m_file_verbose_limit. */
87 static void check_log_limit(ZebraHandle zh)
89 if (zh->records_processed + zh->records_skipped == zh->m_file_verbose_limit)
91 yaz_log(YLOG_LOG, "More than %d file log entries. Omitting rest",
92 zh->m_file_verbose_limit);
/* Count one more processed record and, each time the count is a multiple
   of 1000, log the processed total followed by the inserted / updated /
   deleted counters (the "i/u/d" part of the message). */
96 static void logRecord(ZebraHandle zh)
99 ++zh->records_processed;
100 if (!(zh->records_processed % 1000))
102 yaz_log(YLOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
103 ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT,
104 zh->records_processed, zh->records_inserted,
105 zh->records_updated, zh->records_deleted);
/* Fill in the parts of ctrl that depend on the handle. flagShowRecords is
   set when zh->m_flag_rw is 0; elsewhere in this file m_flag_rw == 0 is
   handled as test mode. */
109 static void init_extractCtrl(ZebraHandle zh, struct recExtractCtrl *ctrl)
111 ctrl->flagShowRecords = !zh->m_flag_rw;
/* Forward declarations of static helpers that are referenced in this file
   before their definitions. */
115 static void extract_add_index_string(RecWord *p,
116 zinfo_index_category_t cat,
117 const char *str, int length);
119 static void extract_set_store_data_prepare(struct recExtractCtrl *p);
/* init callback installed in recExtractCtrl.init throughout this file;
   it prepares the RecWord w. The default index name set here is "any". */
121 static void extract_init(struct recExtractCtrl *p, RecWord *w)
124 w->index_name = "any";
/* Context used while extracting snippets: extract_snippet() stores a
   pointer to it in recExtractCtrl.handle and the snippet_* callbacks read
   it back. */
132 struct snip_rec_info {
/* list that the snippet_add_* functions append to */
134 zebra_snippets *snippets;
/* Add at most one snippet for the whole term of p.
   The term (p->term_buf, p->term_len) is fed through zebra_maps_input();
   runs that map to CHR_SPACE are skipped, `start` is moved to the first
   non-whitespace input position, and mapped characters are collected in
   buf up to IT_MAX_WORD.
   ord is the index ordinal passed on to zebra_snippets_appendn(). */
138 static void snippet_add_complete_field(RecWord *p, int ord,
141 struct snip_rec_info *h = p->extractCtrl->handle;
143 const char *b = p->term_buf;
144 char buf[IT_MAX_WORD+1];
145 const char **map = 0;
146 int i = 0, remain = p->term_len;
147 const char *start = b;
148 const char *last = 0;
151 map = zebra_maps_input(zm, &b, remain, 1);
153 while (remain > 0 && i < IT_MAX_WORD)
155 while (map && *map && **map == *CHR_SPACE)
/* b was advanced by zebra_maps_input(); recompute what is left */
157 remain = p->term_len - (b - p->term_buf);
160 start = b; /* set to first non-ws area */
163 int first = i ? 0 : 1; /* first position */
165 map = zebra_maps_input(zm, &b, remain, first);
/* separate this word from the previous one with a single space */
173 if (i && i < IT_MAX_WORD)
174 buf[i++] = *CHR_SPACE;
175 while (map && *map && **map != *CHR_SPACE)
177 const char *cp = *map;
179 if (**map == *CHR_CUT)
185 if (i >= IT_MAX_WORD)
187 while (i < IT_MAX_WORD && *cp)
191 remain = p->term_len - (b - p->term_buf);
194 map = zebra_maps_input(zm, &b, remain, 0);
/* the snippet text is the original input between start and last, not the
   mapped characters collected in buf */
202 if (last && start != last && zebra_maps_is_index(zm))
203 zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
204 start, last - start);
/* Add snippets for a field that is split into words.
   The term of p is fed through zebra_maps_input(). A run of input that
   maps to CHR_SPACE is appended with 1 as the third argument of
   zebra_snippets_appendn(), a run of other characters with 0; in both
   cases the appended text is the original input between start and last
   and is only added when the map is an index map.
   NOTE(review): the third argument is presumably a "whitespace" flag -
   confirm against zebra_snippets_appendn(). */
207 static void snippet_add_incomplete_field(RecWord *p, int ord, zebra_map_t zm)
209 struct snip_rec_info *h = p->extractCtrl->handle;
210 const char *b = p->term_buf;
211 int remain = p->term_len;
213 const char **map = 0;
214 const char *start = b;
215 const char *last = b;
218 map = zebra_maps_input(zm, &b, remain, 0);
222 char buf[IT_MAX_WORD+1];
226 while (map && *map && **map == *CHR_SPACE)
/* b was advanced by zebra_maps_input(); recompute what is left */
228 remain = p->term_len - (b - p->term_buf);
231 map = zebra_maps_input(zm, &b, remain, 0);
237 if (start != last && zebra_maps_is_index(zm))
239 zebra_snippets_appendn(h->snippets, p->seqno, 1, ord,
240 start, last - start);
246 while (map && *map && **map != *CHR_SPACE)
248 const char *cp = *map;
250 while (i < IT_MAX_WORD && *cp)
252 remain = p->term_len - (b - p->term_buf);
255 map = zebra_maps_input(zm, &b, remain, 0);
265 if (zebra_maps_is_first_in_field(zm))
267 /* first in field marker */
271 if (start != last && zebra_maps_is_index(zm))
272 zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
273 start, last - start);
/* Add snippets for a term using the tokenizer of map zm; this is the
   variant snippet_token_add() selects when zebra_maps_is_icu(zm) holds.
   Each token returned by zebra_map_tokenize_next() contributes its display
   form (display_buf, display_len) as a snippet when the map is an index
   map; the normalized form in res_buf is not used for the snippet. */
280 static void snippet_add_icu(RecWord *p, int ord, zebra_map_t zm)
282 struct snip_rec_info *h = p->extractCtrl->handle;
284 const char *res_buf = 0;
287 const char *display_buf = 0;
288 size_t display_len = 0;
290 zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
291 while (zebra_map_tokenize_next(zm, &res_buf, &res_len,
292 &display_buf, &display_len))
294 if (zebra_maps_is_index(zm))
295 zebra_snippets_appendn(h->snippets, p->seqno, 0, ord,
296 display_buf, display_len);
/* tokenAdd callback used by extract_snippet().
   Finds (or creates) the zebra map for the index type of p, looks up the
   ordinal of the (index_type, index_name) pair in the index category and
   hands the word to the ICU, complete-field or incomplete-field snippet
   routine depending on the kind of map. */
301 static void snippet_token_add(RecWord *p)
303 struct snip_rec_info *h = p->extractCtrl->handle;
304 ZebraHandle zh = h->zh;
305 zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
309 ZebraExplainInfo zei = zh->reg->zei;
310 int ch = zebraExplain_lookup_attr_str(
311 zei, zinfo_index_category_index, p->index_type, p->index_name);
313 if (zebra_maps_is_icu(zm))
314 snippet_add_icu(p, ch, zm);
317 if (zebra_maps_is_complete(zm))
318 snippet_add_complete_field(p, ch, zm);
320 snippet_add_incomplete_field(p, ch, zm);
/* schemaAdd callback installed by extract_snippet(). */
325 static void snippet_schema_add(
326 struct recExtractCtrl *p, Odr_oid *oid)
/* Run the extract function of record type rt on stream with the snippet
   callbacks installed (snippet_token_add / snippet_schema_add), so that
   tokens are turned into snippets instead of index keys.
   The control block is set up as for the first record of an insert, with
   an empty match criteria, static rank 0 and no setStoreData callback.
   NOTE(review): info is what the callbacks receive through
   extractCtrl.handle; it presumably carries zh and sn - confirm. */
331 void extract_snippet(ZebraHandle zh, zebra_snippets *sn,
332 struct ZebraRecStream *stream,
333 RecType rt, void *recTypeClientData)
335 struct recExtractCtrl extractCtrl;
336 struct snip_rec_info info;
339 extractCtrl.stream = stream;
340 extractCtrl.first_record = 1;
341 extractCtrl.init = extract_init;
342 extractCtrl.tokenAdd = snippet_token_add;
343 extractCtrl.schemaAdd = snippet_schema_add;
347 extractCtrl.dh = zh->reg->dh;
351 extractCtrl.handle = &info;
352 extractCtrl.match_criteria[0] = '\0';
353 extractCtrl.staticrank = 0;
354 extractCtrl.action = action_insert;
356 init_extractCtrl(zh, &extractCtrl);
358 extractCtrl.setStoreData = 0;
360 r = (*rt->extract)(recTypeClientData, &extractCtrl);
/* Collect the terms that the record keys hold for index index_name.
   The ordinal of the index is looked up for index types "0", "p" and "w"
   (presumably as successive fallbacks - confirm). The keys are then
   scanned; for a key whose first component equals that ordinal, the
   sequence number (last key component) is turned into an offset relative
   to startSeq, and only offsets within [0, ws_length) are accepted as a
   slot of ws. */
364 static void searchRecordKey(ZebraHandle zh,
365 zebra_rec_keys_t reckeys,
366 const char *index_name,
367 const char **ws, int ws_length)
371 zinfo_index_category_t cat = zinfo_index_category_index;
373 for (i = 0; i<ws_length; i++)
377 ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "0", index_name);
379 ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "p", index_name);
381 ch = zebraExplain_lookup_attr_str(zh->reg->zei, cat, "w", index_name);
386 if (zebra_rec_keys_rewind(reckeys))
393 while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
395 assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
/* the sequence number is the last component of the key */
397 seqno = key.mem[key.len-1];
/* the first component of the key is the index ordinal */
399 if (key.mem[0] == ch)
405 woff = seqno - startSeq;
406 if (woff >= 0 && woff < ws_length)
/* Characters treated as blanks in a match specification (tab, space). */
413 #define FILE_MATCH_BLANK "\t "

/* Build the match string of a record from the match specification spec.
   Spec elements handled by the visible code:
   - a parenthesised "(attset,attname)" or "(attname)" element: the terms
     the record keys hold for index attname are fetched with
     searchRecordKey() into 32 slots and appended;
   - a special name, one of group, database, filename and type, replaced
     by the corresponding value from the handle or fname;
   - a string in single or double quotes, copied literally.
   Warnings are logged for a missing ')', for a record without the
   requested match fields, for a syntax error and for an empty result.
   The string is assembled in the function-static dstBuf, so the function
   is not reentrant and a later call overwrites the previous result.
   NOTE(review): strchr() also matches the terminating NUL, so the blank
   skipping loops that lack a "*s &&" guard step past the end of spec when
   it ends at that point (for example right after '(').
   NOTE(review): no length check on dstBuf is visible before the strcpy()
   calls that append to dst - confirm the 2048 bytes cannot be exceeded. */
415 static char *get_match_from_spec(ZebraHandle zh,
416 zebra_rec_keys_t reckeys,
417 const char *fname, const char *spec)
419 static char dstBuf[2048]; /* static here ??? */
421 const char *s = spec;
425 for (; *s && strchr(FILE_MATCH_BLANK, *s); s++)
432 char attset_str[64], attname_str[64];
436 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
/* copy the first name; characters that do not fit are dropped */
438 for (i = 0; *s && *s != ',' && *s != ')' &&
439 !strchr(FILE_MATCH_BLANK, *s); s++)
440 if (i+1 < sizeof(attset_str))
441 attset_str[i++] = *s;
442 attset_str[i] = '\0';
444 for (; strchr(FILE_MATCH_BLANK, *s); s++)
/* the single name is used as the attribute name as well */
447 strcpy(attname_str, attset_str);
450 for (s++; strchr(FILE_MATCH_BLANK, *s); s++)
452 for (i = 0; *s && *s != ')' &&
453 !strchr(FILE_MATCH_BLANK, *s); s++)
454 if (i+1 < sizeof(attname_str))
455 attname_str[i++] = *s;
456 attname_str[i] = '\0';
460 yaz_log(YLOG_WARN, "Missing ) in match criteria %s in group %s",
461 spec, zh->m_group ? zh->m_group : "none");
466 searchRecordKey(zh, reckeys, attname_str, ws, 32);
467 if (0) /* for debugging */
469 for (i = 0; i<32; i++)
473 WRBUF w = wrbuf_hex_str(ws[i]);
474 yaz_log(YLOG_LOG, "ws[%d] = %s", i, wrbuf_cstr(w));
480 for (i = 0; i<32; i++)
489 dst += strlen(ws[i]);
493 yaz_log(YLOG_WARN, "Record didn't contain match"
494 " fields in (%s,%s)", attset_str, attname_str);
502 const char *spec_src = NULL;
503 const char *s1 = ++s;
504 while (*s1 && !strchr(FILE_MATCH_BLANK, *s1))
/* truncate the special name so that it fits in special[] */
508 if (spec_len > sizeof(special)-1)
509 spec_len = sizeof(special)-1;
510 memcpy(special, s, spec_len);
511 special[spec_len] = '\0';
514 if (!strcmp(special, "group"))
515 spec_src = zh->m_group;
516 else if (!strcmp(special, "database"))
517 spec_src = zh->basenames[0];
518 else if (!strcmp(special, "filename")) {
521 else if (!strcmp(special, "type"))
522 spec_src = zh->m_record_type;
527 strcpy(dst, spec_src);
528 dst += strlen(spec_src);
531 else if (*s == '\"' || *s == '\'')
/* the closing quote must be the same character as the opening one */
533 int stopMarker = *s++;
537 while (*s && *s != stopMarker)
539 if (i+1 < sizeof(tmpString))
540 tmpString[i++] = *s++;
545 strcpy(dst, tmpString);
546 dst += strlen(tmpString);
550 yaz_log(YLOG_WARN, "Syntax error in match criteria %s in group %s",
551 spec, zh->m_group ? zh->m_group : "none");
558 yaz_log(YLOG_WARN, "No match criteria for record %s in group %s",
559 fname, zh->m_group ? zh->m_group : "none");
564 if (0) /* for debugging */
566 WRBUF w = wrbuf_hex_str(dstBuf);
567 yaz_log(YLOG_LOG, "get_match_from_spec %s", wrbuf_cstr(w));
/* Record logging context; holds a pointer to the record group. */
574 struct recordLogInfo {
577 struct recordGroup *rGroup;
580 /** \brief add the always-matches index entry and map to real record ID
581 \param ctrl record control
582 \param record_id custom record ID
583 \param sysno system record ID
585 This function serves two purposes. It adds the always-matches
586 entry and makes a pointer from the custom record ID (if defined)
587 back to the system record ID (sysno).
588 See zebra_recid_to_sysno.
590 static void all_matches_add(struct recExtractCtrl *ctrl, zint record_id,
594 extract_init(ctrl, &word);
595 word.record_id = record_id;
596 /* we use the seqno as placeholder for a way to get back to
597 record database from _ALLRECORDS.. This is used if a custom
598 RECORD was defined */
/* the entry goes to index "_ALLRECORDS" with index type "w", in the
   always-matches category */
600 word.index_name = "_ALLRECORDS";
601 word.index_type = "w";
603 extract_add_index_string(&word, zinfo_index_category_alwaysmatches,
607 /* forward declaration; the definition is further down in this file */
608 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh,
609 struct ZebraRecStream *stream,
610 enum zebra_recctrl_action_t action,
611 const char *recordType,
613 const char *match_criteria,
616 void *recTypeClientData);
/* Index (or delete) the records contained in file fname.
   Steps shown by the code:
   - build the resource prefix "<group>." from zh->m_group and take the
     file extension from fname;
   - when zh->m_record_type is not set, read it from resource
     "<prefix>recordType.<ext>"; if it is still unset the file is skipped
     and counted in zh->records_skipped;
   - when zh->m_record_id is not set, read it from "<prefix>recordId.<ext>";
   - resolve the record type with recType_byName() and check its version;
   - open the file (below zh->path_reg unless fname is absolute), wrap the
     descriptor in a stream and pass it to zebra_extract_records_stream().
   zh->m_record_type is restored to its value on entry after an open
   failure and after the extraction.
   NOTE(review): sprintf/strcpy/strcat write to gprefix, ext, ext_res and
   full_rep without a visible length check - confirm that the buffers are
   large enough for any group name and file name. */
619 ZEBRA_RES zebra_extract_file(ZebraHandle zh, zint *sysno, const char *fname,
620 enum zebra_recctrl_action_t action)
622 ZEBRA_RES r = ZEBRA_OK;
627 struct file_read_info *fi = 0;
628 const char *original_record_type = 0;
630 void *recTypeClientData;
631 struct ZebraRecStream stream, *streamp;
633 zebra_init_log_level();
635 if (!zh->m_group || !*zh->m_group)
638 sprintf(gprefix, "%s.", zh->m_group);
640 yaz_log(log_level_extract, "zebra_extract_file %s", fname);
642 /* determine file extension */
644 for (i = strlen(fname); --i >= 0; )
647 else if (fname[i] == '.')
649 strcpy(ext, fname+i+1);
652 /* determine file type - depending on extension */
653 original_record_type = zh->m_record_type;
654 if (!zh->m_record_type)
656 sprintf(ext_res, "%srecordType.%s", gprefix, ext);
657 zh->m_record_type = res_get(zh->res, ext_res);
659 if (!zh->m_record_type)
/* only mention the skipped file while below the verbose limit */
662 if (zh->records_processed + zh->records_skipped
663 < zh->m_file_verbose_limit)
664 yaz_log(YLOG_LOG, "? %s", fname);
665 zh->records_skipped++;
668 /* determine match criteria */
669 if (!zh->m_record_id)
671 sprintf(ext_res, "%srecordId.%s", gprefix, ext);
672 zh->m_record_id = res_get(zh->res, ext_res);
676 recType_byName(zh->reg->recTypes, zh->res, zh->m_record_type,
677 &recTypeClientData)))
679 yaz_log(YLOG_WARN, "No such record type: %s", zh->m_record_type);
683 switch(recType->version)
688 yaz_log(YLOG_WARN, "Bad filter version: %s", zh->m_record_type);
690 if (sysno && (action == action_delete || action == action_a_delete))
/* relative file names are resolved against zh->path_reg when it is set */
699 if (zh->path_reg && !yaz_is_abspath(fname))
701 strcpy(full_rep, zh->path_reg);
702 strcat(full_rep, "/");
703 strcat(full_rep, fname);
706 strcpy(full_rep, fname);
708 if ((fd = open(full_rep, O_BINARY|O_RDONLY)) == -1)
710 yaz_log(YLOG_WARN|YLOG_ERRNO, "open %s", full_rep);
711 zh->m_record_type = original_record_type;
715 zebra_create_stream_fd(streamp, fd, 0);
717 r = zebra_extract_records_stream(zh, streamp,
721 0, /*match_criteria */
723 recType, recTypeClientData);
725 stream.destroy(streamp);
726 zh->m_record_type = original_record_type;
731 If sysno is provided, then it's used to identify the record.
732 If not, and match_criteria is provided, then sysno is guessed
733 If not, and a record is provided, then sysno is got from there
/* Index (or delete) the records held in the memory buffer buf of
   buf_size bytes.
   The record type is resolved with recType_byName() from recordType when
   that is a non-empty string; the other branch uses zh->m_record_type and
   warns when it is not set. A warning is logged when the type is unknown.
   The buffer is wrapped in a memory stream, passed to
   zebra_extract_records_stream(), and the stream is destroyed
   afterwards. */
737 ZEBRA_RES zebra_buffer_extract_record(ZebraHandle zh,
738 const char *buf, size_t buf_size,
739 enum zebra_recctrl_action_t action,
740 const char *recordType,
742 const char *match_criteria,
745 struct ZebraRecStream stream;
750 if (recordType && *recordType)
752 yaz_log(log_level_extract,
753 "Record type explicitly specified: %s", recordType);
754 recType = recType_byName(zh->reg->recTypes, zh->res, recordType,
759 if (!(zh->m_record_type))
761 yaz_log(YLOG_WARN, "No such record type defined");
764 yaz_log(log_level_extract, "Get record type from rgroup: %s",
766 recType = recType_byName(zh->reg->recTypes, zh->res,
767 zh->m_record_type, &clientData);
/* from here on recordType names the type actually used (for logging) */
768 recordType = zh->m_record_type;
773 yaz_log(YLOG_WARN, "No such record type: %s", recordType);
777 zebra_create_stream_mem(&stream, buf, buf_size);
779 res = zebra_extract_records_stream(zh, &stream,
785 recType, clientData);
786 stream.destroy(&stream);
/* Extract and index a single record read from stream.
   Steps shown by the code, in order:
   - reset zh->reg->keys and zh->reg->sortKeys and select (or create) the
     database zh->basenames[0] in the explain information;
   - run recType->extract with the key collecting callbacks of this file
     and act on the RECCTRL_EXTRACT_* result code;
   - pick the match string: match_criteria, replaced by
     extractCtrl.match_criteria when the filter has set one; the other
     source is get_match_from_spec() applied to zh->m_record_id;
   - look the match string up in zh->reg->matchDict to find the sysno of
     an existing record;
   - a record that does not exist yet is added (delete and replace actions
     are refused for it with a warning); an existing record is skipped for
     action_insert, removed for the delete actions and updated otherwise;
   - for added and updated records the stored record information (file
     type, file name, delete keys, sort keys, store data, database name,
     size, run number, offset) is refreshed and committed with rec_put().
   When zh->m_flag_rw is 0 the record is only logged as "test".
   Both extract_flush_record_keys2() and extract_flush_record_keys() calls
   appear below; NOTE(review): presumably these are alternatives selected
   at compile time - confirm. */
790 static ZEBRA_RES zebra_extract_record_stream(ZebraHandle zh,
791 struct ZebraRecStream *stream,
792 enum zebra_recctrl_action_t action,
793 const char *recordType,
795 const char *match_criteria,
798 void *recTypeClientData,
803 RecordAttr *recordAttr;
804 struct recExtractCtrl extractCtrl;
806 const char *matchStr = 0;
808 off_t start_offset = 0, end_offset = 0;
809 const char *pr_fname = fname; /* filename to print .. */
/* progress lines are only wanted while below the verbose limit */
810 int show_progress = zh->records_processed + zh->records_skipped
811 < zh->m_file_verbose_limit ? 1:0;
813 zebra_init_log_level();
816 pr_fname = "<no file>"; /* make it printable if file is omitted */
818 zebra_rec_keys_reset(zh->reg->keys);
819 zebra_rec_keys_reset(zh->reg->sortKeys);
821 if (zebraExplain_curDatabase(zh->reg->zei, zh->basenames[0]))
823 if (zebraExplain_newDatabase(zh->reg->zei, zh->basenames[0],
824 zh->m_explain_database))
830 off_t null_offset = 0;
831 extractCtrl.stream = stream;
833 start_offset = stream->tellf(stream);
/* a record that starts at offset 0 is the first one of the stream */
835 extractCtrl.first_record = start_offset ? 0 : 1;
837 stream->endf(stream, &null_offset);;
839 extractCtrl.init = extract_init;
840 extractCtrl.tokenAdd = extract_token_add;
841 extractCtrl.schemaAdd = extract_schema_add;
842 extractCtrl.dh = zh->reg->dh;
843 extractCtrl.handle = zh;
844 extractCtrl.match_criteria[0] = '\0';
845 extractCtrl.staticrank = 0;
846 extractCtrl.action = action;
848 init_extractCtrl(zh, &extractCtrl);
850 extract_set_store_data_prepare(&extractCtrl);
852 r = (*recType->extract)(recTypeClientData, &extractCtrl);
/* for action_update the action left in extractCtrl by the filter is
   taken over */
854 if (action == action_update)
856 action = extractCtrl.action;
861 case RECCTRL_EXTRACT_EOF:
863 case RECCTRL_EXTRACT_ERROR_GENERIC:
864 /* error occurred during extraction ... */
865 yaz_log(YLOG_WARN, "extract error: generic");
867 case RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER:
868 /* error occurred during extraction ... */
869 yaz_log(YLOG_WARN, "extract error: no such filter");
871 case RECCTRL_EXTRACT_SKIP:
873 yaz_log(YLOG_LOG, "skip %s %s " ZINT_FORMAT,
874 recordType, pr_fname, (zint) start_offset);
877 end_offset = stream->endf(stream, 0);
879 stream->seekf(stream, end_offset);
882 case RECCTRL_EXTRACT_OK:
885 yaz_log(YLOG_WARN, "extract error: unknown error: %d", r);
888 end_offset = stream->endf(stream, 0);
890 stream->seekf(stream, end_offset);
892 end_offset = stream->tellf(stream);
/* a match string set by the filter replaces the one given by the caller */
894 if (extractCtrl.match_criteria[0])
895 match_criteria = extractCtrl.match_criteria;
900 if (zh->m_flag_rw == 0)
902 yaz_log(YLOG_LOG, "test %s %s " ZINT_FORMAT, recordType,
903 pr_fname, (zint) start_offset);
904 /* test mode .. Do not perform match */
912 if (match_criteria && *match_criteria)
913 matchStr = match_criteria;
916 if (zh->m_record_id && *zh->m_record_id)
918 matchStr = get_match_from_spec(zh, zh->reg->keys, pr_fname,
922 yaz_log(YLOG_LOG, "error %s %s " ZINT_FORMAT, recordType,
923 pr_fname, (zint) start_offset);
928 WRBUF w = wrbuf_alloc();
930 for (i = 0; i < strlen(matchStr); i++)
932 wrbuf_printf(w, "%02X", matchStr[i] & 0xff);
934 yaz_log(YLOG_LOG, "Got match %s", wrbuf_cstr(w));
941 int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
942 char *rinfo = dict_lookup_ord(zh->reg->matchDict, db_ord,
946 if (log_level_extract)
948 WRBUF w = wrbuf_hex_str(matchStr);
949 yaz_log(log_level_extract, "matchStr: %s", wrbuf_cstr(w));
/* the dictionary value is one length byte followed by the sysno bytes */
954 assert(*rinfo == sizeof(*sysno));
955 memcpy(sysno, rinfo+1, sizeof(*sysno));
962 /* new record AKA does not exist already */
963 if (action == action_delete)
965 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
966 pr_fname, (zint) start_offset);
967 yaz_log(YLOG_WARN, "cannot delete record above (seems new)");
970 else if (action == action_a_delete)
973 yaz_log(YLOG_LOG, "adelete %s %s " ZINT_FORMAT, recordType,
974 pr_fname, (zint) start_offset);
977 else if (action == action_replace)
979 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
980 pr_fname, (zint) start_offset);
981 yaz_log(YLOG_WARN, "cannot update record above (seems new)");
985 yaz_log(YLOG_LOG, "add %s %s " ZINT_FORMAT, recordType, pr_fname,
986 (zint) start_offset);
987 rec = rec_new(zh->reg->records);
994 all_matches_add(&extractCtrl,
995 zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1000 recordAttr = rec_init_attr(zh->reg->zei, rec);
1001 if (extractCtrl.staticrank < 0)
1003 yaz_log(YLOG_WARN, "Negative staticrank for record. Set to 0");
1004 extractCtrl.staticrank = 0;
/* remember match string -> sysno so that the record can be found again */
1009 int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1010 dict_insert_ord(zh->reg->matchDict, db_ord, matchStr,
1011 sizeof(*sysno), sysno);
1014 extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1016 extract_flush_record_keys2(zh, *sysno,
1017 zh->reg->keys, extractCtrl.staticrank,
1018 0, recordAttr->staticrank);
1020 extract_flush_record_keys(zh, *sysno, 1, zh->reg->keys,
1021 extractCtrl.staticrank);
1023 recordAttr->staticrank = extractCtrl.staticrank;
1024 zh->records_inserted++;
1028 /* record already exists */
1029 zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1030 zebra_rec_keys_t sortKeys = zebra_rec_keys_open();
1031 if (action == action_insert)
1033 yaz_log(YLOG_LOG, "skipped %s %s " ZINT_FORMAT,
1034 recordType, pr_fname, (zint) start_offset);
1039 rec = rec_get(zh->reg->records, *sysno);
1044 all_matches_add(&extractCtrl,
1045 zebra_rec_keys_get_custom_record_id(zh->reg->keys),
1049 recordAttr = rec_init_attr(zh->reg->zei, rec);
1051 /* decrease total size */
1052 zebraExplain_recordBytesIncrement(zh->reg->zei,
1053 - recordAttr->recordSize);
/* the keys stored with the old version of the record are the ones that
   have to be removed from the index */
1055 zebra_rec_keys_set_buf(delkeys,
1056 rec->info[recInfo_delKeys],
1057 rec->size[recInfo_delKeys],
1059 zebra_rec_keys_set_buf(sortKeys,
1060 rec->info[recInfo_sortKeys],
1061 rec->size[recInfo_sortKeys],
1064 extract_flush_sort_keys(zh, *sysno, 0, sortKeys);
1066 extract_flush_record_keys(zh, *sysno, 0, delkeys,
1067 recordAttr->staticrank);
1069 if (action == action_delete || action == action_a_delete)
1071 /* record going to be deleted */
1073 extract_flush_record_keys2(zh, *sysno, 0, recordAttr->staticrank,
1074 delkeys, recordAttr->staticrank);
1076 if (zebra_rec_keys_empty(delkeys))
1078 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1079 pr_fname, (zint) start_offset);
1080 yaz_log(YLOG_WARN, "cannot delete file above, "
1081 "storeKeys false (3)");
1086 yaz_log(YLOG_LOG, "delete %s %s " ZINT_FORMAT, recordType,
1087 pr_fname, (zint) start_offset);
1088 zh->records_deleted++;
1091 int db_ord = zebraExplain_get_database_ord(zh->reg->zei);
1092 dict_delete_ord(zh->reg->matchDict, db_ord, matchStr);
1094 rec_del(zh->reg->records, &rec);
1096 zebra_rec_keys_close(delkeys);
1097 zebra_rec_keys_close(sortKeys);
1103 { /* update or special_update */
1105 yaz_log(YLOG_LOG, "update %s %s " ZINT_FORMAT, recordType,
1106 pr_fname, (zint) start_offset);
1107 extract_flush_sort_keys(zh, *sysno, 1, zh->reg->sortKeys);
1110 extract_flush_record_keys2(zh, *sysno,
1111 zh->reg->keys, extractCtrl.staticrank,
1112 delkeys, recordAttr->staticrank);
1114 extract_flush_record_keys(zh, *sysno, 1,
1115 zh->reg->keys, extractCtrl.staticrank);
1117 recordAttr->staticrank = extractCtrl.staticrank;
1118 zh->records_updated++;
1120 zebra_rec_keys_close(delkeys);
1121 zebra_rec_keys_close(sortKeys);
1123 /* update file type */
1124 xfree(rec->info[recInfo_fileType]);
1125 rec->info[recInfo_fileType] =
1126 rec_strdup(recordType, &rec->size[recInfo_fileType]);
1128 /* update filename */
/* NOTE(review): fname (not pr_fname) is stored here, and the handling of
   pr_fname above suggests fname may be omitted - presumably rec_strdup()
   accepts a null pointer; confirm */
1129 xfree(rec->info[recInfo_filename]);
1130 rec->info[recInfo_filename] =
1131 rec_strdup(fname, &rec->size[recInfo_filename]);
1133 /* update delete keys */
1134 xfree(rec->info[recInfo_delKeys]);
/* the new keys are kept with the record only when there are any and
   zh->m_store_keys is 1 */
1135 if (!zebra_rec_keys_empty(zh->reg->keys) && zh->m_store_keys == 1)
1137 zebra_rec_keys_get_buf(zh->reg->keys,
1138 &rec->info[recInfo_delKeys],
1139 &rec->size[recInfo_delKeys]);
1143 rec->info[recInfo_delKeys] = NULL;
1144 rec->size[recInfo_delKeys] = 0;
1146 /* update sort keys */
1147 xfree(rec->info[recInfo_sortKeys]);
1149 zebra_rec_keys_get_buf(zh->reg->sortKeys,
1150 &rec->info[recInfo_sortKeys],
1151 &rec->size[recInfo_sortKeys]);
/* the record size is the number of stream bytes the filter consumed */
1155 recordAttr->recordSize = end_offset - start_offset;
1156 zebraExplain_recordBytesIncrement(zh->reg->zei,
1157 recordAttr->recordSize);
1160 /* set run-number for this record */
1161 recordAttr->runNumber =
1162 zebraExplain_runNumberIncrement(zh->reg->zei, 0);
1164 /* update store data */
1165 xfree(rec->info[recInfo_storeData]);
1167 /* update store data */
1168 if (zh->store_data_buf)
/* the buffer in zh->store_data_buf is handed over to the record; the
   handle's pointer is cleared */
1170 rec->size[recInfo_storeData] = zh->store_data_size;
1171 rec->info[recInfo_storeData] = zh->store_data_buf;
1172 zh->store_data_buf = 0;
1173 recordAttr->recordSize = zh->store_data_size;
1175 else if (zh->m_store_data)
/* copy the raw record bytes from the stream, then put the stream
   position back where it was */
1177 off_t cur_offset = stream->tellf(stream);
1179 rec->size[recInfo_storeData] = recordAttr->recordSize;
1180 rec->info[recInfo_storeData] = (char *)
1181 xmalloc(recordAttr->recordSize);
1182 stream->seekf(stream, start_offset);
1183 stream->readf(stream, rec->info[recInfo_storeData],
1184 recordAttr->recordSize);
1185 stream->seekf(stream, cur_offset);
1189 rec->info[recInfo_storeData] = NULL;
1190 rec->size[recInfo_storeData] = 0;
1192 /* update database name */
1193 xfree(rec->info[recInfo_databaseName]);
1194 rec->info[recInfo_databaseName] =
1195 rec_strdup(zh->basenames[0], &rec->size[recInfo_databaseName]);
1198 recordAttr->recordOffset = start_offset;
1200 /* commit this record */
1201 rec_put(zh->reg->records, &rec);
1206 /** \brief extracts records from stream
1207 \param zh Zebra Handle
1208 \param stream stream that we read from
1209 \param action (action_insert, action_replace, action_delete, ..)
1210 \param recordType Record filter type "grs.xml", etc.
1211 \param sysno pointer to sysno if already known; NULL otherwise
1212 \param match_criteria (NULL if not already given)
1213 \param fname filename that we read from (for logging purposes only)
1214 \param recType record type
1215 \param recTypeClientData client data for record type
1216 \returns ZEBRA_OK for success; ZEBRA_FAIL for failure
/* Driver around zebra_extract_record_stream(): the call is handed &more
   and its result is compared with ZEBRA_OK.
   NOTE(review): presumably the call is repeated while more is set and the
   loop ends at the first failure - confirm. */
1218 ZEBRA_RES zebra_extract_records_stream(ZebraHandle zh,
1219 struct ZebraRecStream *stream,
1220 enum zebra_recctrl_action_t action,
1221 const char *recordType,
1223 const char *match_criteria,
1226 void *recTypeClientData)
1228 ZEBRA_RES res = ZEBRA_OK;
1232 res = zebra_extract_record_stream(zh, stream,
1238 recType, recTypeClientData, &more);
1244 if (res != ZEBRA_OK)
/* Index the explain record rec from the data1 tree n.
   handle is the ZebraHandle, passed as void *.
   The database named in rec->info[recInfo_databaseName] is selected (or
   created), the key buffers are reset and grs_extract_tree() extracts the
   keys from the tree with static rank 0.
   When the record has stored delete keys (rec->size[recInfo_delKeys] is
   non-zero) those keys and the stored sort keys are given to the flush
   routines as the old values; the flush calls after that block pass the
   new keys only. Finally the new keys and sort keys replace the buffers
   stored in rec.
   Both extract_flush_record_keys2() and extract_flush_record_keys() calls
   appear; NOTE(review): presumably alternatives selected at compile time -
   confirm. */
1252 ZEBRA_RES zebra_extract_explain(void *handle, Record rec, data1_node *n)
1254 ZebraHandle zh = (ZebraHandle) handle;
1255 struct recExtractCtrl extractCtrl;
1257 if (zebraExplain_curDatabase(zh->reg->zei,
1258 rec->info[recInfo_databaseName]))
1261 if (zebraExplain_newDatabase(zh->reg->zei,
1262 rec->info[recInfo_databaseName], 0))
1266 zebra_rec_keys_reset(zh->reg->keys);
1267 zebra_rec_keys_reset(zh->reg->sortKeys);
1269 extractCtrl.init = extract_init;
1270 extractCtrl.tokenAdd = extract_token_add;
1271 extractCtrl.schemaAdd = extract_schema_add;
1272 extractCtrl.dh = zh->reg->dh;
1274 init_extractCtrl(zh, &extractCtrl);
/* overrides the value init_extractCtrl() has just derived from the handle */
1276 extractCtrl.flagShowRecords = 0;
1277 extractCtrl.match_criteria[0] = '\0';
1278 extractCtrl.staticrank = 0;
1279 extractCtrl.action = action_update;
1281 extractCtrl.handle = handle;
1282 extractCtrl.first_record = 1;
1284 extract_set_store_data_prepare(&extractCtrl);
1287 grs_extract_tree(&extractCtrl, n);
1289 if (rec->size[recInfo_delKeys])
1291 zebra_rec_keys_t delkeys = zebra_rec_keys_open();
1293 zebra_rec_keys_t sortkeys = zebra_rec_keys_open();
1295 zebra_rec_keys_set_buf(delkeys, rec->info[recInfo_delKeys],
1296 rec->size[recInfo_delKeys],
1299 extract_flush_record_keys2(zh, rec->sysno,
1300 zh->reg->keys, 0, delkeys, 0);
1302 extract_flush_record_keys(zh, rec->sysno, 0, delkeys, 0);
1303 extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1305 zebra_rec_keys_close(delkeys);
1307 zebra_rec_keys_set_buf(sortkeys, rec->info[recInfo_sortKeys],
1308 rec->size[recInfo_sortKeys],
1311 extract_flush_sort_keys(zh, rec->sysno, 0, sortkeys);
1312 zebra_rec_keys_close(sortkeys);
1317 extract_flush_record_keys2(zh, rec->sysno, zh->reg->keys, 0, 0, 0);
1319 extract_flush_record_keys(zh, rec->sysno, 1, zh->reg->keys, 0);
1322 extract_flush_sort_keys(zh, rec->sysno, 1, zh->reg->sortKeys);
/* store the new keys with the record; they are what gets read back as
   delete keys above */
1324 xfree(rec->info[recInfo_delKeys]);
1325 zebra_rec_keys_get_buf(zh->reg->keys,
1326 &rec->info[recInfo_delKeys],
1327 &rec->size[recInfo_delKeys]);
1329 xfree(rec->info[recInfo_sortKeys]);
1330 zebra_rec_keys_get_buf(zh->reg->sortKeys,
1331 &rec->info[recInfo_sortKeys],
1332 &rec->size[recInfo_sortKeys]);
/* Log one index key together with its term at log level `level`.
   The first key component is the index ordinal; it is resolved to the
   index type and the index name with zebraExplain_lookup_ord(). All key
   components are printed into keystr.
   When the first byte of str is below CHR_BASE_CHAR the term is reported
   by name ("alwaysmatches", "firstinfield", "unknown", "space", or "?")
   and the numeric value of each of its slen bytes is printed; the other
   log call prints the term converted by zebra_term_untrans_iconv() in
   double quotes.
   NOTE(review): keystr and dst_buf are fixed 200 byte buffers filled with
   sprintf(); confirm they cannot overflow for long keys or terms.
   NOTE(review): the "firstinfield" test is a plain `if`, not `else if`,
   after the "alwaysmatches" test. */
1336 void zebra_it_key_str_dump(ZebraHandle zh, struct it_key *key,
1337 const char *str, size_t slen, NMEM nmem, int level)
1339 char keystr[200]; /* room for zints to print */
1341 int ord = CAST_ZINT_TO_INT(key->mem[0]);
1342 const char *index_type;
1344 const char *string_index;
1346 zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1347 0/* db */, &string_index);
1349 zebra_term_untrans_iconv(zh, nmem, index_type,
1352 for (i = 0; i < key->len; i++)
/* append at the current end of keystr */
1354 sprintf(keystr + strlen(keystr), ZINT_FORMAT " ", key->mem[i]);
1357 if (*str < CHR_BASE_CHAR)
1360 char dst_buf[200]; /* room for special chars */
1362 strcpy(dst_buf , "?");
1364 if (!strcmp(str, ""))
1365 strcpy(dst_buf, "alwaysmatches");
1366 if (!strcmp(str, FIRST_IN_FIELD_STR))
1367 strcpy(dst_buf, "firstinfield");
1368 else if (!strcmp(str, CHR_UNKNOWN))
1369 strcpy(dst_buf, "unknown");
1370 else if (!strcmp(str, CHR_SPACE))
1371 strcpy(dst_buf, "space");
1373 for (i = 0; i<slen; i++)
1375 sprintf(dst_buf + strlen(dst_buf), " %d", str[i] & 0xff);
1377 yaz_log(level, "%s%s %s %s", keystr, index_type,
1378 string_index, dst_buf);
1382 yaz_log(level, "%s%s %s \"%s\"", keystr, index_type,
1383 string_index, dst_term);
/* Dump every key of reckeys to the log with zebra_it_key_str_dump().
   The dump runs when zebra_rec_keys_rewind() reports that keys are
   available; an NMEM is created for the term conversions done per key. */
1386 void extract_rec_keys_log(ZebraHandle zh, int is_insert,
1387 zebra_rec_keys_t reckeys,
1390 if (zebra_rec_keys_rewind(reckeys))
1395 NMEM nmem = nmem_create();
1397 while(zebra_rec_keys_read(reckeys, &str, &slen, &key))
1399 zebra_it_key_str_dump(zh, &key, str, slen, nmem, level);
/* Adjust the occurrence statistics of the explain information for the
   keys in reckeys.
   The keys are read once and grouped by index ordinal (first key
   component) in a linked list of struct ord_stat nodes allocated with
   xmalloc(). For each node zebraExplain_ord_adjust_occurrences() is then
   called with (p->no, 1) or with (-p->no, -1).
   NOTE(review): presumably the positive form is used when is_insert is
   set and the negative one otherwise - confirm. */
1406 void extract_rec_keys_adjust(ZebraHandle zh, int is_insert,
1407 zebra_rec_keys_t reckeys)
1409 ZebraExplainInfo zei = zh->reg->zei;
1413 struct ord_stat *next;
1416 if (zebra_rec_keys_rewind(reckeys))
1418 struct ord_stat *ord_list = 0;
1422 struct it_key key_in;
1423 while(zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
1425 int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
/* look for an existing node for this ordinal */
1427 for (p = ord_list; p ; p = p->next)
1435 p = xmalloc(sizeof(*p));
1446 struct ord_stat *p1 = p;
1449 zebraExplain_ord_adjust_occurrences(zei, p->ord, p->no, 1);
1451 zebraExplain_ord_adjust_occurrences(zei, p->ord, - p->no, -1);
/* Write the index changes of record sysno to the key block: ins_keys are
   the keys of the new version (rank ins_rank), del_keys those of the old
   version (rank del_rank). Callers in this file pass 0 for a key set that
   does not exist.
   The key block is created on first use from the resources "memmax"
   (megabytes, default "8"), "keyTmpDir" (default ".") and "threads"
   (default "1"). Occurrence statistics and the record count are adjusted
   for both key sets.
   The two key lists are then read side by side. A delete key and an
   insert key that compare equal, with equal ranks and byte-identical
   terms, are recognised as a pair; other keys are written with
   key_block_write(), with command 0 for deletions and 1 for insertions.
   The counters normal and optimized are logged at the end.
   NOTE(review): presumably an identical pair is dropped instead of being
   written twice - confirm.
   NOTE(review): mem is an int computed as 1024*1024 * atoi(...); with a
   32-bit int a "memmax" of 2048 or more overflows. */
1458 void extract_flush_record_keys2(ZebraHandle zh, zint sysno,
1459 zebra_rec_keys_t ins_keys, zint ins_rank,
1460 zebra_rec_keys_t del_keys, zint del_rank)
1462 ZebraExplainInfo zei = zh->reg->zei;
1466 if (!zh->reg->key_block)
1468 int mem = 1024*1024 * atoi( res_get_def( zh->res, "memmax", "8"));
1469 const char *key_tmp_dir = res_get_def(zh->res, "keyTmpDir", ".");
1470 int use_threads = atoi(res_get_def(zh->res, "threads", "1"));
1471 zh->reg->key_block = key_block_create(mem, key_tmp_dir, use_threads);
1476 extract_rec_keys_adjust(zh, 1, ins_keys);
1478 zebraExplain_recordCountIncrement(zei, 1);
1479 zebra_rec_keys_rewind(ins_keys);
1483 extract_rec_keys_adjust(zh, 0, del_keys);
1485 zebraExplain_recordCountIncrement(zei, -1);
1486 zebra_rec_keys_rewind(del_keys);
1492 const char *del_str;
1493 struct it_key del_key_in;
1497 const char *ins_str;
1498 struct it_key ins_key_in;
1502 del = zebra_rec_keys_read(del_keys, &del_str, &del_slen,
1505 ins = zebra_rec_keys_read(ins_keys, &ins_str, &ins_slen,
/* same rank, same key and byte-identical term on both sides */
1508 if (del && ins && ins_rank == del_rank
1509 && !key_compare(&del_key_in, &ins_key_in)
1510 && ins_slen == del_slen && !memcmp(del_str, ins_str, del_slen))
1520 key_block_write(zh->reg->key_block, sysno,
1521 &del_key_in, 0, del_str, del_slen,
1522 del_rank, zh->m_staticrank);
1524 key_block_write(zh->reg->key_block, sysno,
1525 &ins_key_in, 1, ins_str, ins_slen,
1526 ins_rank, zh->m_staticrank);
1528 yaz_log(log_level_extract, "normal=%d optimized=%d", normal, optimized);
/* Turn the keys of reckeys into snippets.
   For every key the sequence number (last key component) and the index
   ordinal (first key component) are taken, the index type is looked up
   from the ordinal, the term is converted with zebra_term_untrans_iconv()
   using memory from a local NMEM, and the converted term is appended to
   snippets. */
1532 ZEBRA_RES zebra_rec_keys_to_snippets1(ZebraHandle zh,
1533 zebra_rec_keys_t reckeys,
1534 zebra_snippets *snippets)
1536 NMEM nmem = nmem_create();
1537 if (zebra_rec_keys_rewind(reckeys))
1542 while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1547 const char *index_type;
1549 assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1550 seqno = key.mem[key.len-1];
1551 ord = CAST_ZINT_TO_INT(key.mem[0]);
1553 zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type,
1554 0/* db */, 0 /* string_index */);
1556 zebra_term_untrans_iconv(zh, nmem, index_type,
1558 zebra_snippets_append(snippets, seqno, 0, ord, dst_term);
/* Debugging aid: log the ordinal, sequence number and term of every key
   in reckeys at YLOG_LOG level. The term is converted into the local
   buffer dst_buf with zebra_term_untrans().
   NOTE(review): dst_buf holds IT_MAX_WORD bytes; confirm that
   zebra_term_untrans() never writes more than that. */
1566 void print_rec_keys(ZebraHandle zh, zebra_rec_keys_t reckeys)
1568 yaz_log(YLOG_LOG, "print_rec_keys");
1569 if (zebra_rec_keys_rewind(reckeys))
1574 while (zebra_rec_keys_read(reckeys, &str, &slen, &key))
1576 char dst_buf[IT_MAX_WORD];
1578 const char *index_type;
1579 int ord = CAST_ZINT_TO_INT(key.mem[0]);
1581 assert(key.len <= IT_KEY_LEVEL_MAX && key.len > 2);
1583 zebraExplain_lookup_ord(zh->reg->zei, ord, &index_type, &db, 0);
1585 seqno = key.mem[key.len-1];
1587 zebra_term_untrans(zh, index_type, dst_buf, str);
1589 yaz_log(YLOG_LOG, "ord=%d seqno=" ZINT_FORMAT
1590 " term=%s", ord, seqno, dst_buf);
/* Add the term str (length bytes) of word p to the record keys in
   zh->reg->keys, for index category cat.
   The ordinal of the (index_type, index_name) pair is looked up in the
   explain information; zebraExplain_add_attr_str() is the call that
   registers the pair (presumably only when the lookup fails - confirm).
   The key components written here are the record id, the section id, the
   segment (only with zh->m_segment_indexing) and the sequence number. */
1595 static void extract_add_index_string(RecWord *p, zinfo_index_category_t cat,
1596 const char *str, int length)
1599 ZebraHandle zh = p->extractCtrl->handle;
1600 ZebraExplainInfo zei = zh->reg->zei;
1603 ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
1605 ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
1609 key.mem[i++] = p->record_id;
1610 key.mem[i++] = p->section_id;
1612 if (zh->m_segment_indexing)
1613 key.mem[i++] = p->segment;
1614 key.mem[i++] = p->seqno;
1617 zebra_rec_keys_write(zh->reg->keys, str, length, &key);
/** \brief writes one sort key for a token to the register's sort key set
    \param p token being indexed (supplies index type/name, record id and
             section id)
    \param str sort term bytes to store with the key
    \param length number of bytes in str

    The attribute is looked up (or added) under zinfo_index_category_sort.
    NOTE(review): the lines that declare key/ch, guard the add call and set
    the remaining key fields are not visible in this listing.
*/
1620 static void extract_add_sort_string(RecWord *p, const char *str, int length)
1623 ZebraHandle zh = p->extractCtrl->handle;
1624 ZebraExplainInfo zei = zh->reg->zei;
1626 zinfo_index_category_t cat = zinfo_index_category_sort;
1628 ch = zebraExplain_lookup_attr_str(zei, cat, p->index_type, p->index_name);
/* NOTE(review): presumably only executed when the lookup above fails; the
   guarding line is missing from this listing - confirm */
1630 ch = zebraExplain_add_attr_str(zei, cat, p->index_type, p->index_name);
/* key element 1 is the record id and element 2 the section id */
1633 key.mem[1] = p->record_id;
1634 key.mem[2] = p->section_id;
1636 zebra_rec_keys_write(zh->reg->sortKeys, str, length, &key);
/** \brief parses a token as an integer and stores it as the static rank
    \param p token being indexed; the result goes to p->extractCtrl->staticrank
    \param str digits to parse (need not be NUL-terminated)
    \param length number of bytes in str

    The bytes are copied to the local buffer valz, NUL-terminated and
    converted with atozint. Input longer than the buffer is truncated.
    NOTE(review): the declaration of valz is not visible in this listing.
*/
1639 static void extract_add_staticrank_string(RecWord *p,
1640 const char *str, int length)
1643 struct recExtractCtrl *ctrl = p->extractCtrl;
/* clamp so that the copy plus the terminator fits in valz */
1645 if (length > sizeof(valz)-1)
1646 length = sizeof(valz)-1;
1648 memcpy(valz, str, length);
1649 valz[length] = '\0';
1650 ctrl->staticrank = atozint(valz);
/** \brief routes one finished term to the index, sort or staticrank writer
    \param p token being indexed
    \param zm map whose flags select the destination
    \param string term bytes
    \param length number of bytes in string

    The destination is chosen by testing the map in this order: index, sort,
    staticrank. For index maps that are also "alwaysmatches", an additional
    empty term is written under zinfo_index_category_alwaysmatches.
*/
1653 static void extract_add_string(RecWord *p, zebra_map_t zm,
1654 const char *string, int length)
/* optional detail logging of the escaped term */
1660 if (log_level_details)
1663 WRBUF w = wrbuf_alloc();
1665 wrbuf_write_escaped(w, string, length);
1666 yaz_log(log_level_details, "extract_add_string: %s", wrbuf_cstr(w));
1669 if (zebra_maps_is_index(zm))
/* NOTE(review): the remaining arguments of this call are on a line that is
   not visible here; presumably string and length - confirm */
1671 extract_add_index_string(p, zinfo_index_category_index,
1673 if (zebra_maps_is_alwaysmatches(zm))
/* work on a copy of the token so that *p itself is left untouched;
   NOTE(review): the lines that adjust the copy are not visible here */
1676 memcpy(&word, p, sizeof(word));
1679 extract_add_index_string(
1680 &word, zinfo_index_category_alwaysmatches, "", 0);
1683 else if (zebra_maps_is_sort(zm))
1685 extract_add_sort_string(p, string, length);
1687 else if (zebra_maps_is_staticrank(zm))
1689 extract_add_staticrank_string(p, string, length);
/** \brief feeds the token text through the map and adds the resulting words
    \param p token being indexed (term_buf / term_len hold the input)
    \param zm map used for character mapping via zebra_maps_input

    Mapped entries equal to CHR_SPACE are skipped; the following non-space
    entries are gathered into buf (at most IT_MAX_WORD bytes) and handed to
    extract_add_string.
    NOTE(review): the enclosing loop, the declaration of i, the byte-copy
    statements and the termination checks are not visible in this listing;
    the per-word behaviour described here is inferred from the visible
    loops - confirm against the full source.
*/
1693 static void extract_add_incomplete_field(RecWord *p, zebra_map_t zm)
/* b is the read cursor into the term, remain the bytes left from b */
1695 const char *b = p->term_buf;
1696 int remain = p->term_len;
1698 const char **map = 0;
1701 map = zebra_maps_input(zm, &b, remain, 0);
1705 char buf[IT_MAX_WORD+1];
/* skip entries that map to space */
1709 while (map && *map && **map == *CHR_SPACE)
/* recompute the remaining length from how far b has moved */
1711 remain = p->term_len - (b - p->term_buf);
1713 map = zebra_maps_input(zm, &b, remain, 0);
/* gather entries until the next one that maps to space */
1720 while (map && *map && **map != *CHR_SPACE)
1722 const char *cp = *map;
/* bounded by IT_MAX_WORD so that buf cannot overflow */
1724 while (i < IT_MAX_WORD && *cp)
1726 remain = p->term_len - (b - p->term_buf);
1728 map = zebra_maps_input(zm, &b, remain, 0);
1738 if (zebra_maps_is_first_in_field(zm))
1740 /* first in field marker */
1741 extract_add_string(p, zm, FIRST_IN_FIELD_STR, FIRST_IN_FIELD_LEN);
1745 extract_add_string(p, zm, buf, i);
/** \brief maps the whole token text into one term and adds it
    \param p token being indexed (term_buf / term_len hold the input)
    \param zm map used for character mapping via zebra_maps_input

    Mapped output is accumulated in buf (at most IT_MAX_WORD bytes) while
    input remains. Runs of entries that map to CHR_SPACE are skipped and a
    single *CHR_SPACE byte is written before each following word, except at
    the very start of buf. The buffer is passed to extract_add_string once.
    NOTE(review): the byte-copy statements, the handling of CHR_CUT and the
    loop exits are not visible in this listing - confirm against the full
    source.
*/
1750 static void extract_add_complete_field(RecWord *p, zebra_map_t zm)
/* b is the read cursor into the term, remain the bytes left from b */
1752 const char *b = p->term_buf;
1753 char buf[IT_MAX_WORD+1];
1754 const char **map = 0;
1755 int i = 0, remain = p->term_len;
/* the initial read passes 1 as the last argument (the "first" flag
   computed further down) */
1758 map = zebra_maps_input(zm, &b, remain, 1);
1760 while (remain > 0 && i < IT_MAX_WORD)
/* skip entries that map to space */
1762 while (map && *map && **map == *CHR_SPACE)
/* recompute the remaining length from how far b has moved */
1764 remain = p->term_len - (b - p->term_buf);
1768 int first = i ? 0 : 1; /* first position */
1769 map = zebra_maps_input(zm, &b, remain, first);
/* separate words with one space, but never start buf with a space */
1777 if (i && i < IT_MAX_WORD)
1778 buf[i++] = *CHR_SPACE;
1779 while (map && *map && **map != *CHR_SPACE)
1781 const char *cp = *map;
1783 if (**map == *CHR_CUT)
1789 if (i >= IT_MAX_WORD)
/* bounded by IT_MAX_WORD so that buf cannot overflow */
1791 while (i < IT_MAX_WORD && *cp)
1794 remain = p->term_len - (b - p->term_buf);
1797 map = zebra_maps_input(zm, &b, remain, 0);
1805 extract_add_string(p, zm, buf, i);
/** \brief tokenizes the token text with the map's tokenizer and adds each result
    \param p token being indexed (term_buf / term_len hold the input)
    \param zm map providing zebra_map_tokenize_start / zebra_map_tokenize_next

    Every token returned by zebra_map_tokenize_next is passed to
    extract_add_string.
    NOTE(review): the declaration of res_len is not visible in this listing.
*/
1808 static void extract_add_icu(RecWord *p, zebra_map_t zm)
1810 const char *res_buf = 0;
1813 zebra_map_tokenize_start(zm, p->term_buf, p->term_len);
/* the last two arguments of zebra_map_tokenize_next are passed as 0 */
1814 while (zebra_map_tokenize_next(zm, &res_buf, &res_len, 0, 0))
1816 extract_add_string(p, zm, res_buf, res_len);
1822 /** \brief top-level indexing handler for recctrl system
1823 \param p token data to be indexed
1827 extract_add_{in}complete_field / extract_add_icu
1830 extract_add_index_string
1832 extract_add_sort_string
1834 extract_add_staticrank_string
/* Entry point for one token: fetches (or creates) the map for the token's
   index type, optionally logs the token, applies zebra_replace to the term
   and then hands the token to the ICU, complete-field or incomplete-field
   handler depending on the map.
   NOTE(review): the declaration of wrbuf and the else lines between the
   handler calls are not visible in this listing. */
1837 static void extract_token_add(RecWord *p)
1839 ZebraHandle zh = p->extractCtrl->handle;
1840 zebra_map_t zm = zebra_map_get_or_add(zh->reg->zebra_maps, p->index_type);
/* optional detail logging; the term is printed with an explicit length
   (term_len) via the %.*s conversion */
1843 if (log_level_details)
1845 yaz_log(log_level_details, "extract_token_add "
1846 "type=%s index=%s seqno=" ZINT_FORMAT " s=%.*s",
1847 p->index_type, p->index_name,
1848 p->seqno, p->term_len, p->term_buf);
/* when zebra_replace returns a buffer, the token is repointed at that
   buffer's content for the rest of the processing */
1850 if ((wrbuf = zebra_replace(zm, 0, p->term_buf, p->term_len)))
1852 p->term_buf = wrbuf_buf(wrbuf);
1853 p->term_len = wrbuf_len(wrbuf);
1855 if (zebra_maps_is_icu(zm))
1857 extract_add_icu(p, zm);
1861 if (zebra_maps_is_complete(zm))
1862 extract_add_complete_field(p, zm);
/* NOTE(review): presumably the else branch of the test above - confirm */
1864 extract_add_incomplete_field(p, zm);
/** \brief replaces the handle's stored data buffer with a copy of buf
    \param p extract control; p->handle is the ZebraHandle that owns the buffer
    \param buf data to copy
    \param sz number of bytes in buf

    Any previously stored buffer is freed first and the size reset to 0.
    NOTE(review): a line between the reset and the allocation is not visible
    in this listing; presumably it guards against empty input - confirm.
*/
1868 static void extract_set_store_data_cb(struct recExtractCtrl *p,
1869 void *buf, size_t sz)
1871 ZebraHandle zh = (ZebraHandle) p->handle;
/* drop the old buffer and leave the handle in an empty state */
1873 xfree(zh->store_data_buf);
1874 zh->store_data_buf = 0;
1875 zh->store_data_size = 0;
/* the handle keeps its own copy of the data */
1878 zh->store_data_buf = xmalloc(sz);
1879 zh->store_data_size = sz;
1880 memcpy(zh->store_data_buf, buf, sz);
/** \brief clears the handle's stored data and installs the store-data callback
    \param p extract control; p->handle is the ZebraHandle whose buffer is
             cleared, and p->setStoreData is set to extract_set_store_data_cb
*/
1884 static void extract_set_store_data_prepare(struct recExtractCtrl *p)
1886 ZebraHandle zh = (ZebraHandle) p->handle;
/* free any buffer left on the handle and reset pointer and size */
1887 xfree(zh->store_data_buf);
1888 zh->store_data_buf = 0;
1889 zh->store_data_size = 0;
1890 p->setStoreData = extract_set_store_data_cb;
/** \brief passes a schema OID on to the explain info of the handle's register
    \param p extract control; p->handle is the ZebraHandle
    \param oid schema OID forwarded to zebraExplain_addSchema
*/
1893 static void extract_schema_add(struct recExtractCtrl *p, Odr_oid *oid)
1895 ZebraHandle zh = (ZebraHandle) p->handle;
1896 zebraExplain_addSchema(zh->reg->zei, oid);
/** \brief applies a record's sort keys to the register's sort index
    \param zh Zebra handle; zh->reg->sort_index is updated
    \param sysno record system number, used when a key carries no own sysno
    \param cmd command value (logged below)
    \param reckeys sort keys to process (rewound before reading)

    The keys are first grouped into a list with one entry per
    (ordinal, section id) pair; the key strings of a group are appended to
    that entry's WRBUF, each followed by a NUL byte. The list is then walked
    and each entry is applied to the sort index.
    NOTE(review): the line that chooses between zebra_sort_add and
    zebra_sort_delete is not visible in this listing; presumably it tests
    cmd - confirm. Several struct fields, the group "found" check and the
    final cleanup are likewise not visible.
*/
1899 void extract_flush_sort_keys(ZebraHandle zh, zint sysno,
1900 int cmd, zebra_rec_keys_t reckeys)
/* NOTE(review): lines around this logging are missing from the listing, so
   whether it is compiled in or conditional cannot be told from here */
1903 yaz_log(YLOG_LOG, "extract_flush_sort_keys cmd=%d sysno=" ZINT_FORMAT,
1905 extract_rec_keys_log(zh, cmd, reckeys, YLOG_LOG);
/* keys are only processed when the rewind call returns non-zero */
1908 if (zebra_rec_keys_rewind(reckeys))
1910 zebra_sort_index_t si = zh->reg->sort_index;
1913 struct it_key key_in;
/* list nodes are allocated from this NMEM */
1915 NMEM nmem = nmem_create();
1916 struct sort_add_ent {
1919 struct sort_add_ent *next;
1924 struct sort_add_ent *sort_ent_list = 0;
1926 while (zebra_rec_keys_read(reckeys, &str, &slen, &key_in))
/* key layout read here: element 0 ordinal, element 1 sysno, element 2
   section id */
1928 int ord = CAST_ZINT_TO_INT(key_in.mem[0]);
1929 zint filter_sysno = key_in.mem[1];
1930 zint section_id = key_in.mem[2];
/* search the list for an entry with the same ordinal and section id */
1932 struct sort_add_ent **e = &sort_ent_list;
1933 for (; *e; e = &(*e)->next)
1934 if ((*e)->ord == ord && section_id == (*e)->section_id)
/* NOTE(review): presumably reached only when no entry matched - confirm */
1938 *e = nmem_malloc(nmem, sizeof(**e));
1940 (*e)->wrbuf = wrbuf_alloc();
/* a non-zero sysno in the key takes precedence over the sysno argument */
1943 (*e)->sysno = filter_sysno ? filter_sysno : sysno;
1944 (*e)->section_id = section_id;
/* append this key's string plus a terminating NUL to the entry's buffer */
1947 wrbuf_write((*e)->wrbuf, str, slen);
1948 wrbuf_putc((*e)->wrbuf, '\0');
1952 zint last_sysno = 0;
1953 struct sort_add_ent *e = sort_ent_list;
1954 for (; e; e = e->next)
/* zebra_sort_sysno is only called when the sysno differs from the
   previous entry's */
1956 if (last_sysno != e->sysno)
1958 zebra_sort_sysno(si, e->sysno);
1959 last_sysno = e->sysno;
1961 zebra_sort_type(si, e->ord);
1963 zebra_sort_add(si, e->section_id, e->wrbuf);
1965 zebra_sort_delete(si, e->section_id);
/* each entry's WRBUF is destroyed here */
1966 wrbuf_destroy(e->wrbuf);
1976 * c-file-style: "Stroustrup"
1977 * indent-tabs-mode: nil
1979 * vim: shiftwidth=4 tabstop=8 expandtab