/*
- * Copyright (C) 1994-1999, Index Data
+ * Copyright (C) 1994-2001, Index Data
* All rights reserved.
* Sebastian Hammer, Adam Dickmeiss
*
* $Log: extract.c,v $
- * Revision 1.97 1999-07-06 12:28:04 adam
+ * Revision 1.107 2001-05-28 13:58:48 adam
+ * Call flushSortKeys when record is skipped to fix bad re-use of
+ * sort keys to whatever next record that comes in.
+ *
+ * Revision 1.106 2000/12/05 12:22:53 adam
+ * Termlist source implemented (so that we can index values of XML/SGML
+ * attributes).
+ *
+ * Revision 1.105 2000/12/05 10:01:44 adam
+ * Fixed bug regarding user-defined attribute sets.
+ *
+ * Revision 1.104 2000/09/05 14:04:05 adam
+ * Updates for prefix 'yaz_' for YAZ log functions.
+ *
+ * Revision 1.103 2000/05/18 12:01:36 adam
+ * System call times(2) used again. More 64-bit fixes.
+ *
+ * Revision 1.102 2000/05/15 15:32:33 adam
+ * Added 64 bit file input.
+ *
+ * Revision 1.101 2000/05/15 13:02:39 adam
+ * Minor change.
+ *
+ * Revision 1.100 2000/03/20 19:08:36 adam
+ * Added remote record import using Z39.50 extended services and Segment
+ * Requests.
+ *
+ * Revision 1.99 2000/02/24 10:57:02 adam
+ * Sequence number incremented after each incomplete-field.
+ *
+ * Revision 1.98 1999/09/07 07:19:21 adam
+ * Work on character mapping. Implemented replace rules.
+ *
+ * Revision 1.97 1999/07/06 12:28:04 adam
* Updated record index structure. Format includes version ID. Compression
* algorithm ID is stored for each record block.
*
#include "zinfo.h"
+/*
+ * printf conversion used when recordOffset is printed or logged.
+ * With 64-bit file offsets the standard "ll" length modifier is used;
+ * "L" is only defined for floating-point conversions, so "%Ld" relied
+ * on a C library extension and is not portable.
+ * NOTE(review): assumes the offset argument is long long when
+ * _FILE_OFFSET_BITS is 64 and long otherwise - confirm per platform.
+ */
+#if _FILE_OFFSET_BITS == 64
+#define PRINTF_OFF_T "%lld"
+#else
+#define PRINTF_OFF_T "%ld"
+#endif
+
#ifndef ZEBRASDR
#define ZEBRASDR 0
#endif
static ZebraExplainInfo zti = NULL;
+
static void logRecord (int showFlag)
{
if (!showFlag)
char *recordCompression;
int record_compression = REC_COMPRESS_NONE;
if (!mem)
- mem = atoi(res_get_def (common_resource, "memMax", "4"))*1024*1024;
+ mem = atoi(res_get_def (common_resource, "memMax", "16"))*1024*1024;
if (mem < 50000)
mem = 50000;
key_buf = (char **) xmalloc (mem);
logf (LOG_LOG, "sorting section %d", key_file_no);
#if !SORT_EXTRA
qsort (key_buf + ptr_top-ptr_i, ptr_i, sizeof(char*), key_qsort_compare);
- getFnameTmp (out_fname, key_file_no);
+ getFnameTmp (common_resource, out_fname, key_file_no);
if (!(outf = fopen (out_fname, "wb")))
{
int rw = rGroup->flagRw;
if (rw)
zebraExplain_runNumberIncrement (zti, 1);
- zebraExplain_close (zti, rw, 0);
+ zebraExplain_close (zti, rw);
key_flush ();
xfree (key_buf);
rec_close (&records);
w->attrSet = VAL_BIB1;
w->attrUse = 1016;
w->reg_type = 'w';
+ w->extractCtrl = p;
}
static struct sortKey {
*dst++ = lead;
+#if SU_SCHEME
+ if ((lead & 3) < 3)
+ {
+ int ch = zebraExplain_lookupSU (zti, attrSet, attrUse);
+ if (ch < 0)
+ {
+ ch = zebraExplain_addSU (zti, attrSet, attrUse);
+ }
+ assert (ch > 0);
+ memcpy (dst, &ch, sizeof(ch));
+ dst += sizeof(ch);
+ }
+#else
if (!(lead & 1))
{
memcpy (dst, &attrSet, sizeof(attrSet));
memcpy (dst, &attrUse, sizeof(attrUse));
dst += sizeof(attrUse);
}
+#endif
*dst++ = p->reg_type;
memcpy (dst, string, length);
dst += length;
return;
addString (p, buf, i);
}
+ (p->seqnos[p->reg_type])++; /* to separate this from next one */
}
static void addCompleteField (RecWord *p)
static void addRecordKey (RecWord *p)
{
+ WRBUF wrbuf;
+ if ((wrbuf = zebra_replace(p->zebra_maps, p->reg_type, 0,
+ p->string, p->length)))
+ {
+ p->string = wrbuf_buf(wrbuf);
+ p->length = wrbuf_len(wrbuf);
+ }
if (zebra_maps_is_complete (p->zebra_maps, p->reg_type))
addCompleteField (p);
else
while (sk)
{
struct sortKey *sk_next = sk->next;
- sortIdx_type (sortIdx, sk->attrUse);
- sortIdx_add (sortIdx, sk->string, sk->length);
+ if (cmd >= 0)
+ {
+ sortIdx_type (sortIdx, sk->attrUse);
+ sortIdx_add (sortIdx, sk->string, sk->length);
+ }
xfree (sk->string);
xfree (sk);
sk = sk_next;
static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys)
{
+#if SU_SCHEME
+#else
unsigned char attrSet = (unsigned char) -1;
unsigned short attrUse = (unsigned short) -1;
+#endif
int seqno = 0;
int off = 0;
+ int ch = 0;
zebraExplain_recordCountIncrement (zti, cmd ? 1 : -1);
while (off < reckeys->buf_used)
{
const char *src = reckeys->buf + off;
struct it_key key;
- int lead, ch;
+ int lead;
lead = *src++;
+#if SU_SCHEME
+ if ((lead & 3) < 3)
+ {
+ memcpy (&ch, src, sizeof(ch));
+ src += sizeof(ch);
+ }
+#else
if (!(lead & 1))
{
memcpy (&attrSet, src, sizeof(attrSet));
memcpy (&attrUse, src, sizeof(attrUse));
src += sizeof(attrUse);
}
+#endif
if (key_buf_used + 1024 > (ptr_top-ptr_i)*sizeof(char*))
key_flush ();
++ptr_i;
+
key_buf[ptr_top-ptr_i] = (char*)key_buf + key_buf_used;
+#if SU_SCHEME
+#else
ch = zebraExplain_lookupSU (zti, attrSet, attrUse);
if (ch < 0)
+ {
ch = zebraExplain_addSU (zti, attrSet, attrUse);
+ yaz_log (LOG_LOG, "addSU cmd=%d set=%d use=%d SU=%d",
+ cmd, attrSet, attrUse, ch);
+ }
+#endif
assert (ch > 0);
key_buf_used += key_SU_code (ch, ((char*)key_buf) + key_buf_used);
int off = 0;
int startSeq = -1;
int i;
+ int seqno = 0;
+#if SU_SCHEME
+ int chS, ch;
+#else
short attrUse;
char attrSet;
- int seqno = 0;
+#endif
for (i = 0; i<32; i++)
ws[i] = NULL;
-
+
+#if SU_SCHEME
+ chS = zebraExplain_lookupSU (zti, attrSetS, attrUseS);
+ if (chS < 0)
+ return ws;
+#endif
while (off < reckeys->buf_used)
{
int lead;
lead = *src++;
-
+#if SU_SCHEME
+ if ((lead & 3)<3)
+ {
+ memcpy (&ch, src, sizeof(ch));
+ src += sizeof(ch);
+ }
+#else
if (!(lead & 1))
{
memcpy (&attrSet, src, sizeof(attrSet));
memcpy (&attrUse, src, sizeof(attrUse));
src += sizeof(attrUse);
}
+#endif
wstart = src;
while (*src++)
;
memcpy (&seqno, src, sizeof(seqno));
src += sizeof(seqno);
}
- if (attrUseS == attrUse && attrSetS == attrSet)
+ if (
+#if SU_SCHEME
+ ch == chS
+#else
+ attrUseS == attrUse && attrSetS == attrSet
+#endif
+ )
{
int woff;
static void recordLogPreamble (int level, const char *msg, void *info)
{
struct recordLogInfo *p = (struct recordLogInfo *) info;
- FILE *outf = log_file ();
+ FILE *outf = yaz_log_file ();
if (level & LOG_LOG)
return ;
extractCtrl.fh = fi;
extractCtrl.subType = subType;
extractCtrl.init = wordInit;
- extractCtrl.addWord = addRecordKey;
- extractCtrl.addSchema = addSchema;
+ extractCtrl.tokenAdd = addRecordKey;
+ extractCtrl.schemaAdd = addSchema;
extractCtrl.dh = rGroup->dh;
for (i = 0; i<256; i++)
{
extractCtrl.flagShowRecords = !rGroup->flagRw;
if (!rGroup->flagRw)
- printf ("File: %s %ld\n", fname, (long) recordOffset);
+ printf ("File: %s " PRINTF_OFF_T "\n", fname, recordOffset);
logInfo.fname = fname;
logInfo.recordOffset = recordOffset;
if (rGroup->flagRw &&
records_processed < rGroup->fileVerboseLimit)
{
- logf (LOG_WARN, "fail %s %s %ld", rGroup->recordType,
- fname, (long) recordOffset);
+ logf (LOG_WARN, "fail %s %s " PRINTF_OFF_T, rGroup->recordType,
+ fname, recordOffset);
}
return 0;
}
is probably empty - unless flagShowRecords is in use */
if (!rGroup->flagRw)
return 1;
- logf (LOG_WARN, "No keys generated for file %s", fname);
- logf (LOG_WARN, " The file is probably empty");
+
+ logf (LOG_WARN, "empty %s %s " PRINTF_OFF_T, rGroup->recordType,
+ fname, recordOffset);
return 1;
}
}
/* new record */
if (deleteFlag)
{
- logf (LOG_LOG, "delete %s %s %ld", rGroup->recordType,
- fname, (long) recordOffset);
+ logf (LOG_LOG, "delete %s %s " PRINTF_OFF_T, rGroup->recordType,
+ fname, recordOffset);
logf (LOG_WARN, "cannot delete record above (seems new)");
return 1;
}
if (records_processed < rGroup->fileVerboseLimit)
- logf (LOG_LOG, "add %s %s %ld", rGroup->recordType,
- fname, (long) recordOffset);
+ logf (LOG_LOG, "add %s %s " PRINTF_OFF_T, rGroup->recordType,
+ fname, recordOffset);
rec = rec_new (records);
*sysno = rec->sysno;
if (recordAttr->runNumber == zebraExplain_runNumberIncrement (zti, 0))
{
- logf (LOG_LOG, "skipped %s %s %ld", rGroup->recordType,
- fname, (long) recordOffset);
+ logf (LOG_LOG, "skipped %s %s " PRINTF_OFF_T, rGroup->recordType,
+ fname, recordOffset);
+ flushSortKeys (*sysno, -1);
rec_rm (&rec);
logRecord (0);
return 1;
/* record going to be deleted */
if (!delkeys.buf_used)
{
- logf (LOG_LOG, "delete %s %s %ld", rGroup->recordType,
- fname, (long) recordOffset);
+ logf (LOG_LOG, "delete %s %s " PRINTF_OFF_T,
+ rGroup->recordType, fname, recordOffset);
logf (LOG_WARN, "cannot delete file above, storeKeys false");
}
else
{
if (records_processed < rGroup->fileVerboseLimit)
- logf (LOG_LOG, "delete %s %s %ld", rGroup->recordType,
- fname, (long) recordOffset);
+ logf (LOG_LOG, "delete %s %s " PRINTF_OFF_T,
+ rGroup->recordType, fname, recordOffset);
records_deleted++;
if (matchStr)
dict_delete (matchDict, matchStr);
/* record going to be updated */
if (!delkeys.buf_used)
{
- logf (LOG_LOG, "update %s %s %ld", rGroup->recordType,
- fname, (long) recordOffset);
+ logf (LOG_LOG, "update %s %s " PRINTF_OFF_T,
+ rGroup->recordType, fname, recordOffset);
logf (LOG_WARN, "cannot update file above, storeKeys false");
}
else
{
if (records_processed < rGroup->fileVerboseLimit)
- logf (LOG_LOG, "update %s %s %ld", rGroup->recordType,
- fname, (long) recordOffset);
+ logf (LOG_LOG, "update %s %s " PRINTF_OFF_T,
+ rGroup->recordType, fname, recordOffset);
flushRecordKeys (*sysno, 1, &reckeys);
records_updated++;
}
xmalloc (recordAttr->recordSize);
if (lseek (fi->fd, recordOffset, SEEK_SET) < 0)
{
- logf (LOG_ERRNO|LOG_FATAL, "seek to %ld in %s",
- (long) recordOffset, fname);
+ logf (LOG_ERRNO|LOG_FATAL, "seek to " PRINTF_OFF_T " in %s",
+ recordOffset, fname);
exit (1);
}
if (read (fi->fd, rec->info[recInfo_storeData], recordAttr->recordSize)
{
if (zebraExplain_newDatabase (zti, rGroup->databaseName,
rGroup->explainDatabase))
- abort ();
+ return 0;
}
if (rGroup->flagStoreData == -1)
if (rGroup->flagStoreKeys == -1)
rGroup->flagStoreKeys = 0;
-#if ZEBRASDR
- if (rGroup->useSDR)
- {
- ZebraSdrHandle h;
- char xname[128], *xp;
-
- strncpy (xname, fname, 127);
- if (!(xp = strchr (xname, '.')))
- return 0;
- *xp = '\0';
- if (strcmp (xp+1, "sdr.bits"))
- return 0;
-
- h = zebraSdr_open (xname);
- if (!h)
- {
- logf (LOG_WARN, "sdr open %s", xname);
- return 0;
- }
- for (;;)
- {
- unsigned char *buf;
- char sdr_name[128];
- int r, segmentno;
-
- segmentno = zebraSdr_segment (h, 0);
- sprintf (sdr_name, "%%%s.%d", xname, segmentno);
-
-#if 0
- if (segmentno > 20)
- break;
-#endif
- r = zebraSdr_read (h, &buf);
-
- if (!r)
- break;
-
- fi = file_read_start (0);
- fi->sdrbuf = buf;
- fi->sdrmax = r;
- do
- {
- file_begin (fi);
- r = recordExtract (sysno, sdr_name, rGroup, deleteFlag, fi,
- recType, subType);
- } while (r && !sysno && fi->file_more);
- file_read_stop (fi);
- free (buf);
- }
- zebraSdr_close (h);
- return 1;
- }
-#endif
if (sysno && deleteFlag)
fd = -1;
else
reckeys.prevSeqNo = 0;
extractCtrl.init = wordInit;
- extractCtrl.addWord = addRecordKey;
- extractCtrl.addSchema = addSchema;
+ extractCtrl.tokenAdd = addRecordKey;
+ extractCtrl.schemaAdd = addSchema;
extractCtrl.dh = rGroup->dh;
for (i = 0; i<256; i++)
extractCtrl.seqno[i] = 0;