1 /* This file is part of the Zebra server.
2 Copyright (C) 1994-2011 Index Data
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 #include <yaz/options.h>
26 #include <yaz/xmalloc.h>
27 #include <yaz/marcdisp.h>
28 #include <yaz/timing.h>
30 #include <idzebra/isamb.h>
31 #include <idzebra/dict.h>
/* Head of the singly linked list of collected terms; index_block_flush
 * walks it through each entry's `next` pointer. */
39 struct index_term *terms;
/* Array of pointers to the collected terms. index_block_flush allocates
 * it with one slot per entry and sorts it with cmp_ar. */
40 struct index_term **ar;
/* Link to the following term in the block's term list. */
49 struct index_term *next;
/* Allocate a new index block with xmalloc and give it its own NMEM handle.
 *
 * memory: multiplied by 1024 * 1024 to produce current_max, so it is
 *         presumably a limit expressed in megabytes (confirm with callers).
 *
 * NOTE(review): memory * 1024 * 1024 is computed in int arithmetic; with a
 * 32-bit int a value of 2048 or more would overflow - confirm the expected
 * range of `memory`.
 */
52 struct index_block *index_block_new(int memory)
54 struct index_block *b = xmalloc(sizeof(*b));
56 b->current_max = memory * 1024 * 1024;
58 b->nmem = nmem_create();
/* Tear down an index block.
 *
 * bp: address of the caller's block pointer; the block's NMEM handle is
 *     released with nmem_destroy. Any further cleanup is outside the lines
 *     visible here.
 */
63 void index_block_destroy(struct index_block **bp)
67 nmem_destroy((*bp)->nmem);
/* qsort comparator for an array of `struct index_term *`.
 *
 * p1 and p2 each point at an array element, i.e. at a pointer to a term.
 * Terms are compared with strcmp first, then docid, then seqno. The return
 * statements are not visible here; the comparisons suggest an ordering by
 * (term, docid, seqno) - confirm.
 */
73 static int cmp_ar(const void *p1, const void *p2)
75 struct index_term *t1 = *(struct index_term **) p1;
76 struct index_term *t2 = *(struct index_term **) p2;
77 int d = strcmp(t1->term, t2->term);
/* Compare on document id. */
81 if (t1->docid > t2->docid)
83 else if (t1->docid < t2->docid)
/* Compare on sequence number. */
85 if (t1->seqno > t2->seqno)
87 else if (t1->seqno < t2->seqno)
/* Item reader callback; index_block_flush installs it as
 * isamc_i.read_item with the index block as clientData.
 *
 * vp:         the struct index_block being flushed.
 * dst:        in/out write cursor; one key is copied to *dst and the cursor
 *             is advanced by sizeof(key).
 * insertMode: out parameter; how it is set is not visible in these lines.
 *
 * Each call takes the entry at b->current_entry from the sorted array.
 * The return values, and where current_entry is advanced, are outside the
 * visible lines.
 */
93 int code_read(void *vp, char **dst, int *insertMode)
95 struct index_block *b = (struct index_block *)vp;
/* All entries consumed: presumably signals end-of-input - confirm. */
99 if (b->current_entry >= b->no_entries)
102 t = b->ar[b->current_entry];
/* Key layout: word id, document id, sequence number. */
106 key.mem[0] = t->word_id;
107 key.mem[1] = t->docid;
108 key.mem[2] = t->seqno;
/* Copy the key out and move the caller's cursor past it. */
111 memcpy(*dst, &key, sizeof(key));
113 (*dst) += sizeof(key);
116 yaz_log(YLOG_LOG, "returning " ZINT_FORMAT " " ZINT_FORMAT "\n",
117 key.mem[0], key.mem[1]);
/* Write all terms collected in the block to the dictionary and the ISAMB
 * postings file, then print one line of timing/statistics.
 *
 * Visible steps:
 *   1. build b->ar from the linked term list and sort it with cmp_ar;
 *   2. load the word id sequence ("_w") and the postings position ("_i")
 *      from the dictionary when present;
 *   3. assign a word id to every entry, looking terms up in the dictionary
 *      and inserting them when needed;
 *   4. merge the entries into the ISAMB through code_read;
 *   5. store "_w" and "_i" back and report timings and split counts.
 *
 * b:    block to flush.
 * isb:  ISAMB handle that receives the postings.
 * dict: dictionary mapping terms to word ids and holding the "_w"/"_i"
 *       bookkeeping entries.
 * The rest of the parameter list is outside the visible lines.
 */
122 void index_block_flush(struct index_block *b, ISAMB isb, Dict dict,
125 struct index_term *t = b->terms;
128 int no_words = 0, no_new_words = 0;
129 const char *dict_info = 0;
131 yaz_timing_t tim_dict = 0;
132 yaz_timing_t tim_isamb = 0;
/* Snapshot the split counters; turned into per-flush deltas further down. */
133 zint number_of_int_splits = isamb_get_int_splits(isb);
134 zint number_of_leaf_splits = isamb_get_leaf_splits(isb);
135 zint number_of_dict_splits = dict_get_no_split(dict);
/* One array slot per collected term, filled by walking the linked list. */
137 b->ar = xmalloc(sizeof(*b->ar) * b->no_entries);
138 for (i = 0; i < b->no_entries; i++, t = t->next)
145 qsort(b->ar, b->no_entries, sizeof(*b->ar), cmp_ar);
146 tim_dict = yaz_timing_create();
/* NOTE(review): this dump refers to `ar`, not `b->ar`; presumably it sits
 * in a disabled debug section - confirm. */
148 for (i = 0; i < b->no_entries; i++)
150 printf("%s " ZINT_FORMAT " " ZINT_FORMAT "\n",
151 ar[i]->term, ar[i]->docid, ar[i]->seqno);
/* "_w": persisted word id sequence. The first byte of a dictionary value
 * is checked against the payload size; the payload follows it. */
154 dict_info = dict_lookup(dict, "_w");
157 assert(*dict_info == sizeof(word_id_seq));
158 memcpy(&word_id_seq, dict_info+1, sizeof(word_id_seq));
/* "_i": persisted ISAMB position, same value layout. */
161 dict_info = dict_lookup(dict, "_i");
164 assert(*dict_info == sizeof(isamc_p));
165 memcpy(&isamc_p, dict_info+1, sizeof(isamc_p));
/* Assign word ids. The array is sorted, so an entry with the same term as
 * its predecessor reuses that predecessor's id without a lookup. */
168 for (i = 0; i < b->no_entries; i++)
170 if (i > 0 && strcmp(b->ar[i-1]->term, b->ar[i]->term) == 0)
171 b->ar[i]->word_id = b->ar[i-1]->word_id;
174 const char *dict_info = dict_lookup(dict, b->ar[i]->term);
/* NOTE(review): sizeof(int) is used for the copy and the insert below; the
 * declared types of word_id and word_id_seq are not visible here - confirm
 * they are int-sized. */
177 memcpy(&b->ar[i]->word_id, dict_info+1, sizeof(int));
183 dict_insert(dict, b->ar[i]->term, sizeof(int), &word_id_seq);
184 b->ar[i]->word_id = word_id_seq;
/* Persist the updated word id sequence. */
189 dict_insert(dict, "_w", sizeof(word_id_seq), &word_id_seq);
191 yaz_timing_stop(tim_dict);
192 tim_isamb = yaz_timing_create();
/* Restart the read cursor used by code_read, then merge into the ISAMB. */
194 b->current_entry = 0;
200 isamc_i.clientData = b;
201 isamc_i.read_item = code_read;
203 isamb_merge (isb, &isamc_p, &isamc_i);
/* Persist the ISAMB position passed to the merge. */
206 dict_insert(dict, "_i", sizeof(isamc_p), &isamc_p);
209 yaz_timing_stop(tim_isamb);
/* Convert the snapshots taken at entry into deltas for this flush. */
211 number_of_int_splits = isamb_get_int_splits(isb) - number_of_int_splits;
212 number_of_leaf_splits = isamb_get_leaf_splits(isb) - number_of_leaf_splits;
213 number_of_dict_splits = dict_get_no_split(dict) - number_of_dict_splits;
/* Column header for the statistics line printed below. */
217 printf("# run total dict-real user sys isam-real user sys "
218 " intsp leafsp docs postings words new d-spl\n");
/* One statistics row; some arguments are outside the visible lines. */
221 printf("%5d %9.6f %9.6f %5.2f %5.2f %9.6f %5.2f %5.2f "
222 "%6" ZINT_FORMAT0 " %6" ZINT_FORMAT0
223 " %8d %8d %6d %6d" " %5" ZINT_FORMAT0 "\n",
225 yaz_timing_get_real(tim_dict) + yaz_timing_get_real(tim_isamb),
226 yaz_timing_get_real(tim_dict),
227 yaz_timing_get_user(tim_dict),
228 yaz_timing_get_sys(tim_dict),
229 yaz_timing_get_real(tim_isamb),
230 yaz_timing_get_user(tim_isamb),
231 yaz_timing_get_sys(tim_isamb),
232 number_of_int_splits,
233 number_of_leaf_splits,
238 number_of_dict_splits
248 yaz_timing_destroy(&tim_isamb);
249 yaz_timing_destroy(&tim_dict);
/* Flush the block when its memory use warrants it.
 *
 * Reads the NMEM total for the block and the block's current_max, then
 * calls index_block_flush. The condition guarding the flush is outside the
 * visible lines; presumably total is compared against max - confirm.
 *
 * NOTE(review): the nmem_total result is stored in an int; confirm that it
 * cannot exceed the int range for large memory limits.
 */
252 void index_block_check_flush(struct index_block *b, ISAMB isb, Dict dict,
255 int total = nmem_total(b->nmem);
256 int max = b->current_max;
259 index_block_flush(b, isb, dict, no_docs);
/* Record one occurrence of a term in the block.
 *
 * b:     block that collects the entry; both the entry and its copy of the
 *        term string are allocated from b->nmem.
 * term:  term text; duplicated with nmem_strdup, so the caller keeps
 *        ownership of its buffer.
 * docid: document identifier of the occurrence.
 * seqno: sequence number of the occurrence.
 *
 * How the entry is linked into the block is outside the visible lines.
 */
263 void index_block_add(struct index_block *b,
264 const char *term, zint docid, zint seqno)
266 struct index_term *t = nmem_malloc(b->nmem, sizeof(*t));
267 t->term = nmem_strdup(b->nmem, term);
/* Index a single term occurrence by handing it to index_block_add.
 *
 * seqno is passed by pointer and its current value is used for the entry;
 * any update of *seqno is outside the visible lines. The printf is a term
 * dump; whether it is compiled in or conditional is not visible - confirm.
 */
275 void index_term(struct index_block *b, const char *term,
276 zint docid, zint *seqno)
279 printf("%s " ZINT_FORMAT " " ZINT_FORMAT "\n", term,
282 index_block_add(b, term, docid, *seqno);
/* Split the text held in a WRBUF into terms and index each one.
 *
 * The buffer is scanned character by character. Characters are
 * lower-cased into a local `term` buffer; a term is emitted through
 * index_term at a subfield marker, at one of the listed separator
 * characters, and once more after the scan. The conditions guarding each
 * emit, and the handling of line starts, are only partly visible here.
 *
 * b:     block receiving the terms.
 * wrbuf: text to tokenise (MARC line format, judging by the comments).
 * docid: document identifier attached to every term.
 * The rest of the parameter list is outside the visible lines.
 */
286 void index_wrbuf(struct index_block *b, WRBUF wrbuf, zint docid,
290 const char *cp = wrbuf_buf(wrbuf);
301 { /* skip field+indicator (e.g. 245 00) */
302 for (i = 0; i<6 && *cp; i++, cp++)
306 { /* continuation line */
307 for (i = 0; i<4 && *cp; i++, cp++)
316 index_term(b, term, docid, &seqno);
/* Subfield marker followed by at least one more character. */
322 else if (*cp == subfield_char && cp[1])
326 index_term(b, term, docid, &seqno);
/* Punctuation and space act as term separators. */
331 else if (strchr("$*/-;,.:[]\"&(){} ", *cp))
335 index_term(b, term, docid, &seqno);
/* Read the byte as unsigned before tolower to avoid negative values. */
342 unsigned ch = *(const unsigned char *)cp;
/* Only store the character while the term buffer has room. */
343 if (sz < sizeof(term))
345 term[sz] = tolower(ch);
353 index_term(b, term, docid, &seqno);
/* Index records supplied in MARC line format, read line by line.
 *
 * Lines are accumulated in a WRBUF; when a record is complete the buffer is
 * tokenised by index_wrbuf with '*' as the subfield character. The block is
 * offered for flushing while reading, flushed unconditionally at the end,
 * and then destroyed. The tests that detect record boundaries are only
 * partly visible here.
 *
 * NOTE(review): fgets already leaves room for the terminating NUL, so
 * passing sizeof(line)-1 only leaves one byte of the buffer unused.
 */
356 void index_marc_line_records(ISAMB isb,
362 WRBUF wrbuf = wrbuf_alloc();
366 struct index_block *b = index_block_new(memory);
367 while(fgets(line, sizeof(line)-1, inf))
381 index_block_check_flush(b, isb, dict, no_docs);
388 wrbuf_puts(wrbuf, line);
393 /* index existing buffer (if any) */
394 if (wrbuf_len(wrbuf))
396 index_wrbuf(b, wrbuf, *docid_seq, '*');
399 if (line[0] != ' ' && line[1] != ' ' && line[2] != ' ' &&
402 /* normal field+indicator line */
403 wrbuf_puts(wrbuf, line);
/* Index whatever is still buffered after the last input line. */
407 if (wrbuf_len(wrbuf))
409 index_wrbuf(b, wrbuf, *docid_seq, '*');
/* Final flush of the remaining terms, then release the block. */
414 index_block_flush(b, isb, dict, no_docs);
415 index_block_destroy(&b);
/* Index ISO2709 (binary MARC) records read from a stream.
 *
 * For each record: read the 5-byte length prefix, resynchronise past
 * leading bytes that are not digits, validate the length, read the rest of
 * the record, decode it with yaz_marc_read_iso2709, render it with
 * yaz_marc_write_line and tokenise the result through index_wrbuf with '$'
 * as the subfield character. The block is offered for flushing while
 * reading and flushed unconditionally at the end.
 *
 * verbose, print_offset: enable the diagnostic printf output below.
 * The remaining parameters are outside the visible lines.
 */
418 void index_marc_from_file(ISAMB isb,
423 int verbose, int print_offset)
425 yaz_marc_t mt = yaz_marc_create();
426 WRBUF wrbuf = wrbuf_alloc();
427 struct index_block *b = index_block_new(memory);
/* The record starts with a 5-character length field. */
436 r = fread (buf, 1, 5, inf);
439 if (r && print_offset && verbose)
440 printf ("<!-- Extra %ld bytes at end of file -->\n",
/* Resynchronise: while the first byte is not a digit, shift the 5-byte
 * window by one byte and read a replacement for its last position. */
444 while (*buf < '0' || *buf > '9')
447 long off = ftell(inf) - 5;
448 if (verbose || print_offset)
449 printf("<!-- Skipping bad byte %d (0x%02X) at offset "
451 *buf & 0xff, *buf & 0xff,
453 for (i = 0; i<4; i++)
455 r = fread(buf+4, 1, 1, inf);
461 if (verbose || print_offset)
462 printf ("<!-- End of file with data -->\n");
/* Parse the 5-digit record length and reject implausible values. */
465 len = atoi_n(buf, 5);
466 if (len < 25 || len > 100000)
468 long off = ftell(inf) - 5;
469 printf("Bad Length %ld read at offset %ld (%lx)\n",
470 (long)len, (long) off, (long) off);
/* Read the rest of the record after the 5 bytes already consumed. */
474 r = fread (buf + 5, 1, rlen, inf);
477 yaz_marc_read_iso2709(mt, buf, len);
479 if (yaz_marc_write_line(mt, wrbuf))
482 index_wrbuf(b, wrbuf, *docid_seq, '$');
487 index_block_check_flush(b, isb, dict, no_docs);
/* Final flush, then release every resource created above. */
489 index_block_flush(b, isb, dict, no_docs);
490 wrbuf_destroy(wrbuf);
491 yaz_marc_destroy(mt);
492 index_block_destroy(&b);
/* Print the command-line synopsis to stderr. The name suggests the
 * process then exits; that statement is outside the visible lines. */
495 void exit_usage(void)
497 fprintf(stderr, "benchindex1 [-t type] [-c d:i] [-m mem] [-i] [inputfile]\n");
/* Benchmark driver: parse options, open the block file system, the ISAMB
 * postings file and the dictionary, index the input in the selected format
 * and print overall timings.
 *
 * Visible option handling: a record type of "iso2709" (default) or "line",
 * a "dict:isam" cache size pair, and a switch that clears
 * int_count_enable. Which option letter maps to which branch is only
 * partly visible here.
 */
501 int main(int argc, char **argv)
505 ISAMC_M method_postings;
511 int isam_cache_size = 40;
512 int dict_cache_size = 50;
513 const char *fname = 0;
515 yaz_timing_t tim = 0;
517 const char *dict_info;
518 const char *type = "iso2709";
519 int int_count_enable = 1;
/* Loop until options() returns -2. */
521 while ((ret = options("im:t:c:N", argv, argc, &arg)) != -2)
/* Record type: only "iso2709" and "line" are accepted. */
532 if (!strcmp(arg, "iso2709"))
534 else if (!strcmp(arg, "line"))
538 fprintf(stderr, "bad type: %s.\n", arg);
/* Cache sizes are given as "<dict>:<isam>". */
543 if (sscanf(arg, "%d:%d", &dict_cache_size, &isam_cache_size)
546 fprintf(stderr, "bad cache sizes for -c\n");
554 int_count_enable = 0;
557 fprintf(stderr, "bad option.\n");
/* Open the input in binary mode. */
564 inf = fopen(fname, "rb");
567 fprintf(stderr, "Cannot open %s\n", fname);
/* Echo the effective configuration as '#' lines. */
571 printf("# benchindex1 %s %s\n", __DATE__, __TIME__);
572 printf("# isam_cache_size = %d\n", isam_cache_size);
573 printf("# dict_cache_size = %d\n", dict_cache_size);
574 printf("# int_count_enable = %d\n", int_count_enable);
575 printf("# memory = %d\n", memory);
577 /* setup postings isamb attributes */
578 method_postings.compare_item = key_compare;
579 method_postings.log_item = key_logdump_txt;
581 method_postings.codec.start = iscz1_start;
582 method_postings.codec.decode = iscz1_decode;
583 method_postings.codec.encode = iscz1_encode;
584 method_postings.codec.stop = iscz1_stop;
585 method_postings.codec.reset = iscz1_reset;
587 method_postings.debug = 0;
589 /* create block system */
590 bfs = bfs_create(0, 0);
593 yaz_log(YLOG_WARN, "bfs_create failed");
/* Overall timing starts here, before the index files are opened. */
600 tim = yaz_timing_create();
601 /* create isam handle */
602 isb_postings = isamb_open (bfs, "isamb", isam_cache_size ? 1 : 0,
603 &method_postings, 0);
606 yaz_log(YLOG_WARN, "isamb_open failed");
609 isamb_set_cache_size(isb_postings, isam_cache_size);
610 isamb_set_int_count(isb_postings, int_count_enable);
611 dict = dict_open(bfs, "dict", dict_cache_size, 1, 0, 4096);
/* "_s": persisted document id sequence, stored with a leading size byte
 * that is checked before the payload is copied. */
613 dict_info = dict_lookup(dict, "_s");
616 assert(*dict_info == sizeof(docid_seq));
617 memcpy(&docid_seq, dict_info+1, sizeof(docid_seq));
/* Dispatch on the record type chosen on the command line. */
620 if (!strcmp(type, "iso2709"))
621 index_marc_from_file(isb_postings, dict, &docid_seq, inf, memory,
622 0 /* verbose */ , 0 /* print_offset */);
623 else if (!strcmp(type, "line"))
624 index_marc_line_records(isb_postings, dict, &docid_seq, inf, memory);
/* Report and persist the document id sequence after indexing. */
626 printf("# Total " ZINT_FORMAT " documents\n", docid_seq);
627 dict_insert(dict, "_s", sizeof(docid_seq), &docid_seq);
630 isamb_close(isb_postings);
634 /* exit block system */
636 yaz_timing_stop(tim);
638 printf("# Total timings real=%8.6f user=%3.2f system=%3.2f\n",
639 yaz_timing_get_real(tim),
640 yaz_timing_get_user(tim),
641 yaz_timing_get_sys(tim));
643 yaz_timing_destroy(&tim);
651 * c-file-style: "Stroustrup"
652 * indent-tabs-mode: nil
654 * vim: shiftwidth=4 tabstop=8 expandtab