1 /* $Id: regxread.c,v 1.60 2006-04-26 11:12:31 adam Exp $
2 Copyright (C) 1995-2005
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
29 #include <yaz/tpath.h>
30 #include <idzebra/util.h>
32 #include <idzebra/recgrs.h>
37 #if MAJOR_VERSION >= 8
38 #define HAVE_TCL_OBJECTS
44 #define F_WIN_EOF 2000000000
48 #define REGX_PATTERN 1
53 #define REGX_CONTEXT 6
/*
 * Data structures of the regx lexer: rule actions, rules, contexts and the
 * lexer specification itself.
 * NOTE(review): several struct headers and members are not visible in this
 * listing; the comments below describe only the members that are shown.
 */
63 struct lexRuleAction {
67 struct DFA *dfa; /* REGX_PATTERN */
70 struct regxCode *code; /* REGX_CODE */
/* actions are chained into a singly linked list (walked by actionListDel) */
72 struct lexRuleAction *next;
77 struct lexRuleAction *actionList;
81 struct lexRuleInfo info;
88 struct lexRule *rules;
/* table indexed by rule number; built by readFileSpec from the rules list */
89 struct lexRuleInfo **fastRule;
/* per-context action lists, filled in by readOneSpec */
93 struct lexRuleAction *beginActionList;
94 struct lexRuleAction *endActionList;
95 struct lexRuleAction *initActionList;
96 struct lexContext *next;
/* head of the list of all contexts known to this specification */
106 struct lexContext *context;
/* stack of active contexts; context_stack_top indexes the current one */
108 struct lexContext **context_stack;
109 int context_stack_size;
110 int context_stack_top;
116 Tcl_Interp *tcl_interp;
/* optional callback; lexNode calls it with the current offset when set */
119 void (*f_win_ef)(void *, off_t);
121 int f_win_start; /* first byte of buffer is this file offset */
122 int f_win_end; /* last byte of buffer is this offset - 1 */
123 int f_win_size; /* size of buffer */
124 char *f_win_buf; /* buffer itself */
/* read and seek callbacks used by f_win_get to refill the window */
125 int (*f_win_rf)(void *, char *, size_t);
126 off_t (*f_win_sf)(void *, off_t);
/* one concatenation buffer and one node slot per nesting level (maxLevel) */
128 struct lexConcatBuf *concatBuf;
130 data1_node **d1_stack;
141 struct lexSpec *spec;
/*
 * f_win_get: return a pointer to the input bytes [start_pos, end_pos) as
 * held in the spec's file window and store the byte count in *size.
 * Three cases are handled:
 *  - the range already lies inside the window: point straight into it;
 *  - start_pos lies outside the window: seek and refill from buffer start;
 *  - otherwise: move the retained tail to the front and read more data.
 * NOTE(review): the rest of the signature and a few statements are not
 * visible in this listing.
 */
145 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
148 int i, r, off = start_pos - spec->f_win_start;
/* case 1: requested range is fully buffered */
150 if (off >= 0 && end_pos <= spec->f_win_end)
152 *size = end_pos - start_pos;
153 return spec->f_win_buf + off;
/* case 2: nothing of the requested range is buffered; reposition the window */
155 if (off < 0 || start_pos >= spec->f_win_end)
157 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
158 spec->f_win_start = start_pos;
/* the buffer is allocated lazily on first use */
160 if (!spec->f_win_buf)
161 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
162 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
164 spec->f_win_end = spec->f_win_start + *size;
/* never report more bytes than the caller asked for */
166 if (*size > end_pos - start_pos)
167 *size = end_pos - start_pos;
168 return spec->f_win_buf;
/* case 3: shift the still-valid tail of the window to the buffer start */
170 for (i = 0; i<spec->f_win_end - start_pos; i++)
171 spec->f_win_buf[i] = spec->f_win_buf[i + off];
/* then read new data into the space freed behind the tail */
172 r = (*spec->f_win_rf)(spec->f_win_fh,
174 spec->f_win_size - i);
175 spec->f_win_start = start_pos;
176 spec->f_win_end += r;
178 if (*size > end_pos - start_pos)
179 *size = end_pos - start_pos;
180 return spec->f_win_buf;
/*
 * f_win_advance: return the input byte at *pos and advance *pos.
 * When *pos lies inside the current window the byte is read directly;
 * otherwise F_WIN_EOF is checked and one byte is requested via f_win_get.
 * NOTE(review): the handling after the f_win_get call is not visible here.
 */
183 static int f_win_advance (struct lexSpec *spec, int *pos)
/* fast path: byte is already in the window */
188 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
189 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
190 if (*pos == F_WIN_EOF)
192 buf = f_win_get (spec, *pos, *pos+1, &size);
/*
 * regxCodeDel: release the regxCode that *pp refers to.
 * The Tcl object held by the code has its reference count decremented.
 * NOTE(review): the remaining clean-up and any surrounding #if are not
 * visible in this listing.
 */
202 static void regxCodeDel (struct regxCode **pp)
204 struct regxCode *p = *pp;
209 Tcl_DecrRefCount (p->tcl_obj);
/*
 * regxCodeMk: create a regxCode holding a copy of the len bytes at buf.
 * The text is copied into a freshly allocated string of len+1 bytes and is
 * also wrapped in a Tcl string object whose reference count is incremented.
 * NOTE(review): termination of p->str and the store to *pp are not visible.
 */
217 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
221 p = (struct regxCode *) xmalloc (sizeof(*p));
222 p->str = (char *) xmalloc (len+1);
223 memcpy (p->str, buf, len);
226 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
228 Tcl_IncrRefCount (p->tcl_obj);
/*
 * lexSpecDFA: build the DFA object used for patterns in a lexer spec.
 * The parser character map is adjusted: the entries for blank and tab are
 * removed and an entry for '/' is added.
 * NOTE(review): presumably this makes '/' the pattern delimiter - confirm
 * against the dfa_parse_cmap_* documentation. Creation and return of the
 * DFA are not visible in this listing.
 */
233 static struct DFA *lexSpecDFA (void)
238 dfa_parse_cmap_del (dfa, ' ');
239 dfa_parse_cmap_del (dfa, '\t');
240 dfa_parse_cmap_add (dfa, '/', 0);
/*
 * actionListDel: destroy the action list that *rap points to.
 * Each action is visited once; pattern actions have their DFA deleted and
 * code actions have their regxCode deleted.
 * NOTE(review): the switch on the action kind and the freeing of the list
 * nodes are not visible in this listing.
 */
244 static void actionListDel (struct lexRuleAction **rap)
246 struct lexRuleAction *ra1, *ra;
/* ra1 carries the successor so the loop can continue from it */
248 for (ra = *rap; ra; ra = ra1)
254 dfa_delete (&ra->u.pattern.dfa);
257 regxCodeDel (&ra->u.code);
/*
 * lexContextCreate: allocate a lexer context with the given name.
 * The name is duplicated, a fresh DFA is obtained from lexSpecDFA and the
 * begin/end/init action lists start out empty.
 * NOTE(review): initialisation of the other members is not visible here.
 */
265 static struct lexContext *lexContextCreate (const char *name)
267 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
269 p->name = xstrdup (name);
272 p->dfa = lexSpecDFA ();
275 p->beginActionList = NULL;
276 p->endActionList = NULL;
277 p->initActionList = NULL;
/*
 * lexContextDestroy: release everything owned by context p.
 * Deletes the context DFA, the action list of every rule and the
 * begin/end/init action lists.
 * NOTE(review): freeing of the rule nodes, the name and p itself is not
 * visible in this listing.
 */
282 static void lexContextDestroy (struct lexContext *p)
284 struct lexRule *rp, *rp1;
286 dfa_delete (&p->dfa);
/* rp1 carries the successor so the loop can continue from it */
288 for (rp = p->rules; rp; rp = rp1)
291 actionListDel (&rp->info.actionList);
294 actionListDel (&p->beginActionList);
295 actionListDel (&p->endActionList);
296 actionListDel (&p->initActionList);
/*
 * lexSpecCreate: allocate a lexer specification called name.
 * Copies the name, allocates a context stack of 100 entries and allocates
 * the per-level arrays (concatBuf and d1_stack) with maxLevel entries each.
 * NOTE(review): the assignment of maxLevel, the use of dh and the remaining
 * member initialisation are not visible in this listing.
 */
301 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
306 p = (struct lexSpec *) xmalloc (sizeof(*p));
307 p->name = (char *) xmalloc (strlen(name)+1);
308 strcpy (p->name, name);
315 p->context_stack_size = 100;
316 p->context_stack = (struct lexContext **)
317 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
321 p->concatBuf = (struct lexConcatBuf *)
322 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
/* every concatenation buffer starts out empty and unallocated */
323 for (i = 0; i < p->maxLevel; i++)
325 p->concatBuf[i].max = 0;
326 p->concatBuf[i].buf = 0;
328 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/*
 * lexSpecDestroy: release the lexer specification that *pp refers to.
 * Frees each per-level concatenation buffer and the array holding them,
 * destroys every context, deletes the Tcl interpreter and frees the file
 * window buffer and the context stack.
 * NOTE(review): the NULL check on *pp, the context loop header and the
 * final frees are not visible in this listing.
 */
333 static void lexSpecDestroy (struct lexSpec **pp)
336 struct lexContext *lt;
344 for (i = 0; i < p->maxLevel; i++)
345 xfree (p->concatBuf[i].buf);
346 xfree (p->concatBuf);
/* remember the successor before the context itself is destroyed */
351 struct lexContext *lt_next = lt->next;
352 lexContextDestroy (lt);
357 Tcl_DeleteInterp (p->tcl_interp);
360 xfree (p->f_win_buf);
361 xfree (p->context_stack);
/*
 * readParseToken: read the next token of a spec line starting at *cpp.
 * Leading blanks, tabs and line ends are skipped. Command words are
 * collected into cmd in lower case and compared against "begin", "end",
 * "body", "context" and "init"; unknown words and unexpected characters
 * are reported with a warning.
 * NOTE(review): the returned token codes and the handling of the other
 * token kinds are not visible in this listing.
 */
367 static int readParseToken (const char **cpp, int *len)
369 const char *cp = *cpp;
373 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
402 if (*cp >= 'a' && *cp <= 'z')
/* upper-case letters are folded to lower case */
404 else if (*cp >= 'A' && *cp <= 'Z')
405 cmd[i] = *cp + 'a' - 'A';
/* stop advancing the index before cmd would overflow */
408 if (i < (int) sizeof(cmd)-2)
415 yaz_log (YLOG_WARN, "bad character %d %c", *cp, *cp);
/* scan to the next blank, tab or line end */
417 while (*cp && *cp != ' ' && *cp != '\t' &&
418 *cp != '\n' && *cp != '\r')
424 if (!strcmp (cmd, "begin"))
426 else if (!strcmp (cmd, "end"))
428 else if (!strcmp (cmd, "body"))
430 else if (!strcmp (cmd, "context"))
432 else if (!strcmp (cmd, "init"))
436 yaz_log (YLOG_WARN, "bad command %s", cmd);
/*
 * actionListMk: parse the action part of a rule in s and append the
 * resulting actions to the list at *ap.
 * Tokens are read with readParseToken until it returns 0. Code tokens are
 * stored with regxCodeMk; pattern tokens get their own DFA which is parsed
 * and turned into states; BEGIN and INIT are rejected with a warning.
 * NOTE(review): the switch on the token kind, the list linking and the
 * return value are not visible in this listing.
 */
442 static int actionListMk (struct lexSpec *spec, const char *s,
443 struct lexRuleAction **ap)
449 while ((tok = readParseToken (&s, &len)))
457 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
459 regxCodeMk (&(*ap)->u.code, s, len);
463 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
465 (*ap)->u.pattern.body = bodyMark;
467 (*ap)->u.pattern.dfa = lexSpecDFA ();
469 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
/* NOTE(review): "%.*s" takes an int precision but s-s0 is a pointer
 * difference (s0 is presumably the pattern start - declaration not
 * visible); consider an explicit (int) cast here and in the printf below. */
474 yaz_log (YLOG_WARN, "regular expression error '%.*s'", s-s0, s0);
478 printf ("pattern: %.*s\n", s-s0, s0);
479 dfa_mkstate ((*ap)->u.pattern.dfa);
483 yaz_log (YLOG_WARN, "cannot use BEGIN here");
486 yaz_log (YLOG_WARN, "cannot use INIT here");
489 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/*
 * readOneSpec: parse one logical spec line s and add it to spec.
 * A leading CONTEXT keyword creates a new context and links it at the head
 * of spec->context. BEGIN, END and INIT replace the corresponding action
 * list of the current context. Anything else is parsed as a pattern into
 * the context DFA and becomes a new rule whose actions are read with
 * actionListMk.
 * NOTE(review): the token dispatch and return values are not visible here.
 */
499 int readOneSpec (struct lexSpec *spec, const char *s)
503 struct lexContext *lc;
505 tok = readParseToken (&s, &len);
506 if (tok == REGX_CONTEXT)
508 char context_name[32];
509 tok = readParseToken (&s, &len);
510 if (tok != REGX_CODE)
512 yaz_log (YLOG_WARN, "missing name after CONTEXT keyword");
/* NOTE(review): context_name holds 32 bytes; no bound on len is visible
 * in this listing before the copy - verify that len is limited to 31. */
517 memcpy (context_name, s, len);
518 context_name[len] = '\0';
519 lc = lexContextCreate (context_name);
520 lc->next = spec->context;
/* a context called "main" is created here; its condition is not visible */
525 spec->context = lexContextCreate ("main");
/* BEGIN / END / INIT: drop the old list before building the new one */
530 actionListDel (&spec->context->beginActionList);
531 actionListMk (spec, s, &spec->context->beginActionList);
534 actionListDel (&spec->context->endActionList);
535 actionListMk (spec, s, &spec->context->endActionList);
538 actionListDel (&spec->context->initActionList);
539 actionListMk (spec, s, &spec->context->initActionList);
543 yaz_log (YLOG_LOG, "rule %d %s", spec->context->ruleNo, s);
/* ordinary rule: the pattern goes into the shared DFA of the context */
545 r = dfa_parse (spec->context->dfa, &s);
548 yaz_log (YLOG_WARN, "regular expression error. r=%d", r);
553 yaz_log (YLOG_WARN, "expects / at end of pattern. got %c", *s);
/* the new rule takes the next rule number and is put first in the list */
557 rp = (struct lexRule *) xmalloc (sizeof(*rp));
558 rp->info.no = spec->context->ruleNo++;
559 rp->next = spec->context->rules;
560 spec->context->rules = rp;
561 actionListMk (spec, s, &rp->info.actionList);
/*
 * readFileSpec: load the filter specification file for spec.
 * The file name is "<name>.tflt" when a Tcl interpreter is attached and
 * "<name>.flt" otherwise (the second branch's condition is not visible);
 * it is opened through data1_path_fopen. Lines are collected in a WRBUF
 * and each complete entry is handed to readOneSpec. Afterwards every
 * context gets its fastRule table and its DFA states are generated.
 * NOTE(review): the read loop, the fname declaration and the return values
 * are not visible in this listing.
 */
566 int readFileSpec (struct lexSpec *spec)
568 struct lexContext *lc;
569 int c, i, errors = 0;
575 if (spec->tcl_interp)
/* NOTE(review): sprintf into fname is unbounded; fname's size is not
 * visible here - confirm it can hold spec->name plus the suffix. */
577 sprintf (fname, "%s.tflt", spec->name);
578 spec_inf = data1_path_fopen (spec->dh, fname, "r");
583 sprintf (fname, "%s.flt", spec->name);
584 spec_inf = data1_path_fopen (spec->dh, fname, "r");
588 yaz_log (YLOG_ERRNO|YLOG_WARN, "cannot read spec file %s", spec->name);
591 yaz_log (YLOG_LOG, "reading regx filter %s", fname);
593 if (spec->tcl_interp)
594 yaz_log (YLOG_LOG, "Tcl enabled");
600 debug_dfa_followpos = 0;
604 lineBuf = wrbuf_alloc();
609 wrbuf_rewind (lineBuf);
/* lines starting with '#', a line end or white space get special handling */
610 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
612 while (c != '\n' && c != EOF)
625 wrbuf_putc(lineBuf, c);
633 if (c != ' ' && c != '\t')
/* terminate the collected text so it can be passed on as a C string */
638 wrbuf_putc(lineBuf, '\0');
/* NOTE(review): the result of readOneSpec is not used on this line */
639 readOneSpec (spec, wrbuf_buf(lineBuf));
640 spec->lineNo += addLine;
644 wrbuf_free(lineBuf, 1);
/* build the rule-number lookup table and the DFA states per context */
646 for (lc = spec->context; lc; lc = lc->next)
649 lc->fastRule = (struct lexRuleInfo **)
650 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
651 for (i = 0; i < lc->ruleNo; i++)
652 lc->fastRule[i] = NULL;
653 for (rp = lc->rules; rp; rp = rp->next)
654 lc->fastRule[rp->info.no] = &rp->info;
655 dfa_mkstate (lc->dfa);
/* File-scope lexSpec pointer, initially NULL.
 * NOTE(review): grs_read_regx and grs_read_tcl declare a local variable of
 * the same name (of type struct lexSpec **) which shadows this one. */
664 static struct lexSpec *curLexSpec = NULL;
/*
 * execData: add elen bytes of data at ebuf to the record being built.
 * With an attribute name the data becomes (or is appended to) the value of
 * that attribute on a tag node; otherwise it is accumulated as text for the
 * data node at the current level in that level's concatenation buffer,
 * which tagDataRelease later copies into the node.
 * NOTE(review): several conditions and statements are not visible in this
 * listing, in particular how res is chosen in the attribute branch.
 */
667 static void execData (struct lexSpec *spec,
668 const char *ebuf, int elen, int formatted_text,
669 const char *attribute_str, int attribute_len)
671 struct data1_node *res, *parent;
674 if (elen == 0) /* shouldn't happen, but it does! */
/* long data is logged as its first 40 and last 40 bytes */
678 yaz_log (YLOG_LOG, "data(%d bytes) %.40s ... %.*s", elen,
679 ebuf, 40, ebuf + elen-40);
680 else if (elen == 1 && ebuf[0] == '\n')
682 yaz_log (YLOG_LOG, "data(new line)");
685 yaz_log (YLOG_LOG, "data(%d bytes) %.*s", elen, elen, ebuf);
687 yaz_log (YLOG_LOG, "data(%d bytes)", elen);
690 if (spec->d1_level <= 1)
693 parent = spec->d1_stack[spec->d1_level -1];
700 if (res->which != DATA1N_tag)
702 /* sweep through existing attributes.. */
703 for (ap = &res->u.tag.attributes; *ap; ap = &(*ap)->next)
704 if (strlen((*ap)->name) == attribute_len &&
705 !memcmp((*ap)->name, attribute_str, attribute_len))
709 /* new attribute. Create it with name + value */
710 *ap = nmem_malloc(spec->m, sizeof(**ap));
712 (*ap)->name = nmem_malloc(spec->m, attribute_len+1);
713 memcpy((*ap)->name, attribute_str, attribute_len);
714 (*ap)->name[attribute_len] = '\0';
716 (*ap)->value = nmem_malloc(spec->m, elen+1);
717 memcpy((*ap)->value, ebuf, elen);
718 (*ap)->value[elen] = '\0';
723 /* append to value if attribute already exists */
724 char *nv = nmem_malloc(spec->m, elen + 1 + strlen((*ap)->value));
725 strcpy(nv, (*ap)->value);
726 memcpy (nv + strlen(nv), ebuf, elen);
727 nv[strlen(nv)+elen] = '\0';
/* text data: continue an existing data node at this level if there is one */
733 if ((res = spec->d1_stack[spec->d1_level]) &&
734 res->which == DATA1N_data)
735 org_len = res->u.data.len;
/* otherwise start a new text node under parent */
740 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_data, parent);
741 res->u.data.what = DATA1I_text;
743 res->u.data.formatted_text = formatted_text;
744 res->u.data.data = 0;
746 if (spec->d1_stack[spec->d1_level])
747 spec->d1_stack[spec->d1_level]->next = res;
748 spec->d1_stack[spec->d1_level] = res;
/* grow the level's buffer (with 256 bytes of slack) when it is too small */
750 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
752 char *old_buf, *new_buf;
754 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
755 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
756 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
758 memcpy (new_buf, old_buf, org_len);
761 spec->concatBuf[spec->d1_level].buf = new_buf;
/* append the new bytes after what was already collected */
763 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
764 res->u.data.len += elen;
/* execDataP: add plain data without an attribute; forwards to execData
 * with a null attribute name and zero attribute length. */
768 static void execDataP (struct lexSpec *spec,
769 const char *ebuf, int elen, int formatted_text)
771 execData (spec, ebuf, elen, formatted_text, 0, 0);
/*
 * tagDataRelease: finalise the text node at the current level, if any.
 * For a DATA1N_data / DATA1I_text node the accumulated bytes are copied
 * from the level's concatenation buffer into the node: into memory from
 * spec->m when longer than DATA1_LOCALDATA, otherwise (the else keyword is
 * not visible here) into the node's local buffer lbuf.
 */
774 static void tagDataRelease (struct lexSpec *spec)
778 if ((res = spec->d1_stack[spec->d1_level]) &&
779 res->which == DATA1N_data &&
780 res->u.data.what == DATA1I_text)
/* the node must not have been released before and must hold data */
782 assert (!res->u.data.data);
783 assert (res->u.data.len > 0);
784 if (res->u.data.len > DATA1_LOCALDATA)
785 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
787 res->u.data.data = res->lbuf;
788 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
/*
 * variantBegin: open a variant node of the given class/type with the given
 * value below the current position in the record tree.
 * Class and type are copied into local, length-limited, terminated strings
 * and passed to data1_getvartypeby_absyn. If the parent is not already a
 * variant node a DATA1N_variant node is pushed first; then the variant
 * stack is searched for an entry of the same type, and a new variant node
 * holding the (truncated) value is pushed.
 * NOTE(review): several statements are not visible in this listing.
 */
793 static void variantBegin (struct lexSpec *spec,
794 const char *class_str, int class_len,
795 const char *type_str, int type_len,
796 const char *value_str, int value_len)
/* NOTE(review): this initializer reads d1_stack[d1_level - 1] before the
 * d1_level == 0 check below, i.e. index -1 when no record has begun. */
798 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
799 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
804 if (spec->d1_level == 0)
806 yaz_log (YLOG_WARN, "in variant begin. No record type defined");
/* clamp so the copies below always fit including the terminator */
809 if (class_len >= DATA1_MAX_SYMBOL)
810 class_len = DATA1_MAX_SYMBOL-1;
811 memcpy (tclass, class_str, class_len);
812 tclass[class_len] = '\0';
814 if (type_len >= DATA1_MAX_SYMBOL)
815 type_len = DATA1_MAX_SYMBOL-1;
816 memcpy (ttype, type_str, type_len);
817 ttype[type_len] = '\0';
820 yaz_log (YLOG_LOG, "variant begin(%s,%s,%d)", tclass, ttype,
825 data1_getvartypeby_absyn(spec->dh, parent->root->u.root.absyn,
/* first variant below a non-variant parent: push a variant node */
829 if (parent->which != DATA1N_variant)
831 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
832 if (spec->d1_stack[spec->d1_level])
833 tagDataRelease (spec);
834 spec->d1_stack[spec->d1_level] = res;
835 spec->d1_stack[++(spec->d1_level)] = NULL;
/* walk up through enclosing variant nodes looking for the same type */
837 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
838 if (spec->d1_stack[i]->u.variant.type == tp)
845 yaz_log (YLOG_LOG, "variant node(%d)", spec->d1_level);
847 parent = spec->d1_stack[spec->d1_level-1];
848 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
849 res->u.variant.type = tp;
/* the value is stored in the node's local buffer, truncated if needed */
851 if (value_len >= DATA1_LOCALDATA)
852 value_len =DATA1_LOCALDATA-1;
853 memcpy (res->lbuf, value_str, value_len);
854 res->lbuf[value_len] = '\0';
856 res->u.variant.value = res->lbuf;
858 if (spec->d1_stack[spec->d1_level])
859 tagDataRelease (spec);
860 spec->d1_stack[spec->d1_level] = res;
861 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagStrip: scan the tag *tag of length *len for white space at its end
 * (first loop) and at its start (second loop).
 * NOTE(review): the updates of *tag and *len are not visible in this
 * listing. The characters are passed to isspace as plain char; a cast to
 * unsigned char would be safer for bytes above 127.
 */
864 static void tagStrip (const char **tag, int *len)
868 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
871 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/*
 * tagBegin: open an element named by the len bytes at tag.
 * Warns when no record has been started (d1_level == 0). Otherwise the tag
 * is stripped, pending text at the current level is released, a tag node
 * is created below the node one level up and the level is incremented with
 * the new top slot cleared.
 */
877 static void tagBegin (struct lexSpec *spec,
878 const char *tag, int len)
880 if (spec->d1_level == 0)
882 yaz_log (YLOG_WARN, "in element begin. No record type defined");
885 tagStrip (&tag, &len);
886 if (spec->d1_stack[spec->d1_level])
887 tagDataRelease (spec);
890 yaz_log (YLOG_LOG, "begin tag(%.*s, %d)", len, tag, spec->d1_level);
893 spec->d1_stack[spec->d1_level] = data1_mk_tag_n (
894 spec->dh, spec->m, tag, len, 0, spec->d1_stack[spec->d1_level -1]);
/* descend one level; nothing has been added at the new level yet */
895 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagEnd: close elements while the level is above min_level.
 * Pending text is released on each iteration. The tag node at the level is
 * compared with the stripped tag name by length and content.
 * NOTE(review): the level decrement, the handling of a null tag and the
 * loop exit are not visible in this listing.
 */
898 static void tagEnd (struct lexSpec *spec, int min_level,
899 const char *tag, int len)
901 tagStrip (&tag, &len);
902 while (spec->d1_level > min_level)
904 tagDataRelease (spec);
906 if (spec->d1_level == 0)
908 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
910 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
912 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
916 yaz_log (YLOG_LOG, "end tag(%d)", spec->d1_level);
/*
 * tryMatch: run dfa over the input starting at offset *pptr.
 * On a match *mptr receives the offset where the match starts and *pptr the
 * offset just past its end. A state's rule_no is taken when the previous
 * character was a newline, its rule_nno otherwise.
 * NOTE(review): the loop structure, the use of greedy and the return
 * values are not visible in this listing.
 */
921 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
922 struct DFA *dfa, int greedy)
924 struct DFA_state *state = dfa->states[0];
927 unsigned char c_prev = 0;
928 int ptr = *pptr; /* current pointer */
929 int start_ptr = *pptr; /* first char of match */
930 int last_ptr = 0; /* last char of match */
931 int last_rule = 0; /* rule number of current match */
938 c = f_win_advance (spec, &ptr);
/* still in the start state */
942 if (dfa->states[0] == state)
947 c = f_win_advance (spec, &ptr);
949 if (ptr == F_WIN_EOF)
963 if (--i < 0) /* no transition for character c */
967 *mptr = start_ptr; /* match starts here */
968 *pptr = last_ptr; /* match end here (+1) */
/* restart the automaton from its initial state */
971 state = dfa->states[0];
974 c = f_win_advance (spec, &ptr);
/* c falls inside this transition's character range */
980 else if (c >= t->ch[0] && c <= t->ch[1])
982 state = dfa->states[t->to];
983 if (state->rule_no && c_prev == '\n')
985 last_rule = state->rule_no;
988 else if (state->rule_nno)
990 last_rule = state->rule_nno;
/*
 * execTok: read the next token of an action code string at *src and
 * report it through *tokBuf / *tokLen.
 * Leading blanks and tabs are skipped. "$N" refers to match argument N,
 * whose bytes are fetched from the file window with f_win_get; a double
 * quote starts a quoted string; a newline or ';' is recognised separately;
 * other tokens run up to the next white space.
 * NOTE(review): the return codes are not visible here; execCode compares
 * the result against 3 when scanning options.
 */
1001 static int execTok (struct lexSpec *spec, const char **src,
1002 const char **tokBuf, int *tokLen)
1004 const char *s = *src;
1006 while (*s == ' ' || *s == '\t')
1010 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
/* decimal argument number following the '$' */
1014 while (*s >= '0' && *s <= '9')
1015 n = n*10 + (*s++ -'0');
1016 if (spec->arg_no == 0)
1023 if (n >= spec->arg_no)
1025 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1029 else if (*s == '\"')
/* scan to the closing quote or the end of the string */
1032 while (*s && *s != '\"')
1034 *tokLen = s - *tokBuf;
1039 else if (*s == '\n' || *s == ';')
1047 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1050 *tokLen = s - *tokBuf;
1057 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1060 *tokLen = s - *tokBuf;
/*
 * regxStrz: copy len bytes from src into the caller's buffer str.
 * NOTE(review): any length limit, the terminator and the return statement
 * are not visible in this listing; callers use the result as a C string.
 */
1066 static char *regxStrz (const char *src, int len, char *str)
1070 memcpy (str, src, len);
/*
 * cmd_tcl_begin: Tcl command "begin"; clientData is the lexSpec.
 * Sub-commands handled:
 *   record <absyn>               start a record: root node plus a tag node
 *   element <tag>                open an element via tagBegin
 *   variant <class> <type> <val> open a variant via variantBegin
 *   context <name>               push the named context on the stack
 * NOTE(review): argument-count validation before argv[1] is read and the
 * return values are not visible in this listing.
 */
1076 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1077 int argc, const char **argv)
1079 struct lexSpec *spec = (struct lexSpec *) clientData;
1082 if (!strcmp(argv[1], "record") && argc == 3)
1084 const char *absynName = argv[2];
1088 yaz_log (YLOG_LOG, "begin record %s", absynName);
1090 res = data1_mk_root (spec->dh, spec->m, absynName);
1094 spec->d1_stack[spec->d1_level++] = res;
/* a tag node named after the abstract syntax follows the root */
1096 res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);
1098 spec->d1_stack[spec->d1_level++] = res;
1100 spec->d1_stack[spec->d1_level] = NULL;
1102 else if (!strcmp(argv[1], "element") && argc == 3)
1104 tagBegin (spec, argv[2], strlen(argv[2]));
1106 else if (!strcmp (argv[1], "variant") && argc == 5)
1108 variantBegin (spec, argv[2], strlen(argv[2]),
1109 argv[3], strlen(argv[3]),
1110 argv[4], strlen(argv[4]));
1112 else if (!strcmp (argv[1], "context") && argc == 3)
1114 struct lexContext *lc = spec->context;
1116 yaz_log (YLOG_LOG, "begin context %s",argv[2]);
/* look the context up by name */
1118 while (lc && strcmp (argv[2], lc->name))
/* NOTE(review): no check against context_stack_size is visible here */
1122 spec->context_stack[++(spec->context_stack_top)] = lc;
1125 yaz_log (YLOG_WARN, "unknown context %s", argv[2]);
/*
 * cmd_tcl_end: Tcl command "end"; clientData is the lexSpec.
 * Sub-commands handled:
 *   record             release pending data on all levels and set stop_flag
 *   element [-record]  close elements via tagEnd; stop_flag is set on the
 *                      line after the d1_level <= 1 test (any further
 *                      condition is not visible)
 *   context            pop the context stack unless it is at the bottom
 * NOTE(review): argument validation, the parsing of the element name and
 * the return values are not visible in this listing.
 */
1132 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1133 int argc, const char **argv)
1135 struct lexSpec *spec = (struct lexSpec *) clientData;
1139 if (!strcmp (argv[1], "record"))
1141 while (spec->d1_level)
1143 tagDataRelease (spec);
1147 yaz_log (YLOG_LOG, "end record");
1149 spec->stop_flag = 1;
1151 else if (!strcmp (argv[1], "element"))
1154 const char *element = 0;
1155 if (argc >= 3 && !strcmp(argv[2], "-record"))
/* a null element name is passed with length 0 */
1164 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1165 if (spec->d1_level <= 1)
1168 yaz_log (YLOG_LOG, "end element end records");
1170 spec->stop_flag = 1;
1173 else if (!strcmp (argv[1], "context"))
1176 yaz_log (YLOG_LOG, "end context");
/* the bottom entry of the context stack is never popped */
1178 if (spec->context_stack_top)
1179 (spec->context_stack_top)--;
/*
 * cmd_tcl_data: Tcl command "data"; clientData is the lexSpec.
 * Recognised options are -text, -element <name> and -attribute <name>.
 * An element, when given, is opened around the data with tagBegin and
 * closed again with tagEnd. On Tcl versions newer than 8.0 the data string
 * is converted from UTF-8 to the external encoding before execData is
 * called; otherwise it is passed unchanged.
 * NOTE(review): the option loop, its bounds checks and the return values
 * are not visible in this listing.
 */
1186 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1187 int argc, const char **argv)
1191 const char *element = 0;
1192 const char *attribute = 0;
1193 struct lexSpec *spec = (struct lexSpec *) clientData;
1197 if (!strcmp("-text", argv[argi]))
1202 else if (!strcmp("-element", argv[argi]))
1206 element = argv[argi++];
1208 else if (!strcmp("-attribute", argv[argi]))
1212 attribute = argv[argi++];
1218 tagBegin (spec, element, strlen(element));
1222 #if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0)
1224 char *native = Tcl_UtfToExternalDString(0, argv[argi], -1, &ds);
1225 execData (spec, native, strlen(native), textFlag, attribute,
1226 attribute ? strlen(attribute) : 0);
/* native points into ds, so ds is freed only after execData has used it */
1227 Tcl_DStringFree (&ds);
1229 execData (spec, argv[argi], strlen(argv[argi]), textFlag, attribute,
1230 attribute ? strlen(attribute) : 0);
1235 tagEnd (spec, 2, NULL, 0);
/*
 * cmd_tcl_unread: Tcl command "unread"; clientData is the lexSpec.
 * Repositions the read pointer to the start of match argument <no>, plus
 * an optional -offset value. An index beyond the last argument is replaced
 * by the last argument's index.
 * NOTE(review): atoi gives no error indication for non-numeric input, and
 * no lower bound on no is visible here (negative input, or arg_no == 0
 * making no == -1) - verify. Argument-count checks are not visible either.
 */
1239 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1240 int argc, const char **argv)
1242 struct lexSpec *spec = (struct lexSpec *) clientData;
1249 if (!strcmp("-offset", argv[argi]))
1254 offset = atoi(argv[argi]);
1263 no = atoi(argv[argi]);
1264 if (no >= spec->arg_no)
1265 no = spec->arg_no - 1;
1266 spec->ptr = spec->arg_start[no] + offset;
/*
 * execTcl: run the Tcl code of an action.
 * Every match argument i is first exported as Tcl variable "i" holding the
 * matched text; the code is then evaluated globally, and on failure the
 * error line, result and errorInfo are logged at fatal level.
 */
1270 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1274 for (i = 0; i < spec->arg_no; i++)
1276 char var_name[10], *var_buf;
1279 sprintf (var_name, "%d", i);
1280 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* Temporarily terminate the text inside the window buffer, then restore
 * the saved byte.
 * NOTE(review): var_buf[var_len] is one past the returned range; confirm
 * it is always inside the allocated window buffer. */
1284 ch = var_buf[var_len];
1285 var_buf[var_len] = '\0';
1286 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1287 var_buf[var_len] = ch;
/* NOTE(review): HAVE_TCL_OBJECTS is defined without a value near the top
 * of the file; "#if" on a value-less macro is not a valid expression, so
 * "#ifdef" may be what is intended - confirm. */
1290 #if HAVE_TCL_OBJECTS
1291 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1293 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1297 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1298 yaz_log(YLOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1299 spec->tcl_interp->errorLine,
1300 spec->tcl_interp->result,
1301 err ? err : "[NO ERRORINFO]");
/*
 * execCode: interpret the built-in (non-Tcl) action language in code->str.
 * Tokens are read with execTok. The commands recognised are
 *   begin record|element|variant|context ...
 *   end record|element|context ...
 *   data [-text] [-element e] [-attribute a] <items>
 *   unread [-offset n] <index>
 *   context <name>
 * Unknown commands and stray tokens are reported with a warning.
 * NOTE(review): the enclosing loop and many conditions are not visible in
 * this listing.
 */
1307 static void execCode (struct lexSpec *spec, struct regxCode *code)
1309 const char *s = code->str;
1311 const char *cmd_str;
1313 r = execTok (spec, &s, &cmd_str, &cmd_len);
1320 r = execTok (spec, &s, &cmd_str, &cmd_len);
1323 p = regxStrz (cmd_str, cmd_len, ptmp);
/* ---- begin ... ---- */
1324 if (!strcmp (p, "begin"))
1326 r = execTok (spec, &s, &cmd_str, &cmd_len);
1329 yaz_log (YLOG_WARN, "missing keyword after 'begin'");
1332 p = regxStrz (cmd_str, cmd_len, ptmp);
1333 if (!strcmp (p, "record"))
1335 r = execTok (spec, &s, &cmd_str, &cmd_len);
1338 if (spec->d1_level <= 1)
/* NOTE(review): absynName holds 64 bytes; the bound on cmd_len before the
 * copy is not visible in this listing - verify. */
1340 static char absynName[64];
1345 memcpy (absynName, cmd_str, cmd_len);
1346 absynName[cmd_len] = '\0';
1348 yaz_log (YLOG_LOG, "begin record %s", absynName);
1350 res = data1_mk_root (spec->dh, spec->m, absynName);
1354 spec->d1_stack[spec->d1_level++] = res;
1356 res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);
1358 spec->d1_stack[spec->d1_level++] = res;
1360 spec->d1_stack[spec->d1_level] = NULL;
1362 r = execTok (spec, &s, &cmd_str, &cmd_len);
1364 else if (!strcmp (p, "element"))
1366 r = execTok (spec, &s, &cmd_str, &cmd_len);
1369 tagBegin (spec, cmd_str, cmd_len);
1370 r = execTok (spec, &s, &cmd_str, &cmd_len);
1372 else if (!strcmp (p, "variant"))
1375 const char *class_str = NULL;
1377 const char *type_str = NULL;
1379 const char *value_str = NULL;
/* three tokens are read; class and value assignments are shown below */
1380 r = execTok (spec, &s, &cmd_str, &cmd_len);
1383 class_str = cmd_str;
1384 class_len = cmd_len;
1385 r = execTok (spec, &s, &cmd_str, &cmd_len);
1391 r = execTok (spec, &s, &cmd_str, &cmd_len);
1394 value_str = cmd_str;
1395 value_len = cmd_len;
1397 variantBegin (spec, class_str, class_len,
1398 type_str, type_len, value_str, value_len);
1401 r = execTok (spec, &s, &cmd_str, &cmd_len);
1403 else if (!strcmp (p, "context"))
1407 struct lexContext *lc = spec->context;
1408 r = execTok (spec, &s, &cmd_str, &cmd_len);
1409 p = regxStrz (cmd_str, cmd_len, ptmp);
1411 yaz_log (YLOG_LOG, "begin context %s", p);
1413 while (lc && strcmp (p, lc->name))
/* "begin context" pushes the context on top of the current one */
1416 spec->context_stack[++(spec->context_stack_top)] = lc;
1418 yaz_log (YLOG_WARN, "unknown context %s", p);
1421 r = execTok (spec, &s, &cmd_str, &cmd_len);
1425 yaz_log (YLOG_WARN, "bad keyword '%s' after begin", p);
/* ---- end ... ---- */
1428 else if (!strcmp (p, "end"))
1430 r = execTok (spec, &s, &cmd_str, &cmd_len);
1433 yaz_log (YLOG_WARN, "missing keyword after 'end'");
1436 p = regxStrz (cmd_str, cmd_len, ptmp);
1437 if (!strcmp (p, "record"))
1439 while (spec->d1_level)
1441 tagDataRelease (spec);
1444 r = execTok (spec, &s, &cmd_str, &cmd_len);
1446 yaz_log (YLOG_LOG, "end record");
1448 spec->stop_flag = 1;
1450 else if (!strcmp (p, "element"))
/* options are read while execTok keeps returning 3 */
1453 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1455 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1460 tagEnd (spec, min_level, cmd_str, cmd_len);
1461 r = execTok (spec, &s, &cmd_str, &cmd_len);
1464 tagEnd (spec, min_level, NULL, 0);
1465 if (spec->d1_level <= 1)
1468 yaz_log (YLOG_LOG, "end element end records");
1470 spec->stop_flag = 1;
1474 else if (!strcmp (p, "context"))
1477 yaz_log (YLOG_LOG, "end context");
/* the bottom entry of the context stack is never popped */
1479 if (spec->context_stack_top)
1480 (spec->context_stack_top)--;
1481 r = execTok (spec, &s, &cmd_str, &cmd_len);
1484 yaz_log (YLOG_WARN, "bad keyword '%s' after end", p);
/* ---- data ... ---- */
1486 else if (!strcmp (p, "data"))
1490 const char *element_str = NULL;
1492 const char *attribute_str = NULL;
1494 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1496 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1498 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1500 r = execTok (spec, &s, &element_str, &element_len);
1504 else if (cmd_len==10 && !memcmp ("-attribute", cmd_str,
1507 r = execTok (spec, &s, &attribute_str, &attribute_len);
1512 yaz_log (YLOG_WARN, "bad data option: %.*s",
1517 yaz_log (YLOG_WARN, "missing data item after data");
/* an element given with -element is opened around the data */
1521 tagBegin (spec, element_str, element_len);
1524 execData (spec, cmd_str, cmd_len, textFlag,
1525 attribute_str, attribute_len);
1526 r = execTok (spec, &s, &cmd_str, &cmd_len);
1529 tagEnd (spec, 2, NULL, 0);
/* ---- unread ... ---- */
1531 else if (!strcmp (p, "unread"))
1534 r = execTok (spec, &s, &cmd_str, &cmd_len);
1535 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1537 r = execTok (spec, &s, &cmd_str, &cmd_len);
1540 yaz_log (YLOG_WARN, "missing number after -offset");
1543 p = regxStrz (cmd_str, cmd_len, ptmp);
1545 r = execTok (spec, &s, &cmd_str, &cmd_len);
1551 yaz_log (YLOG_WARN, "missing index after unread command");
/* the index must be a single decimal digit */
1554 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1556 yaz_log (YLOG_WARN, "bad index after unread command");
1561 no = *cmd_str - '0';
1562 if (no >= spec->arg_no)
1563 no = spec->arg_no - 1;
1564 spec->ptr = spec->arg_start[no] + offset;
1566 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* ---- context <name> ---- */
1568 else if (!strcmp (p, "context"))
1572 struct lexContext *lc = spec->context;
1573 r = execTok (spec, &s, &cmd_str, &cmd_len);
1574 p = regxStrz (cmd_str, cmd_len, ptmp);
1576 while (lc && strcmp (p, lc->name))
/* unlike "begin context", this replaces the entry on top of the stack */
1579 spec->context_stack[spec->context_stack_top] = lc;
1581 yaz_log (YLOG_WARN, "unknown context %s", p);
1584 r = execTok (spec, &s, &cmd_str, &cmd_len);
1588 yaz_log (YLOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1589 r = execTok (spec, &s, &cmd_str, &cmd_len);
1594 yaz_log (YLOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1596 r = execTok (spec, &s, &cmd_str, &cmd_len);
/*
 * execAction: execute the action list ap for a match that started at
 * start_ptr, with *pptr as the current read position.
 * Argument 0 starts at start_ptr. Pattern actions call tryMatch and record
 * the offsets of further arguments; code actions publish the argument
 * arrays in spec and run the code through execTcl when a Tcl interpreter
 * is attached, or through execCode.
 * NOTE(review): the loop, the switch on the action kind and the return
 * values are not visible in this listing.
 */
1603 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1604 int start_ptr, int *pptr)
1613 arg_start[0] = start_ptr;
/* the local argument arrays are published through spec */
1615 spec->arg_start = arg_start;
1616 spec->arg_end = arg_end;
/* BODY pattern: the text skipped before the match is an argument too */
1623 if (ap->u.pattern.body)
1625 arg_start[arg_no] = *pptr;
1626 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 0))
1628 arg_end[arg_no] = F_WIN_EOF;
1630 arg_start[arg_no] = F_WIN_EOF;
1631 arg_end[arg_no] = F_WIN_EOF;
1632 yaz_log(YLOG_DEBUG, "Pattern match rest of record");
1637 arg_end[arg_no] = sptr;
1639 arg_start[arg_no] = sptr;
1640 arg_end[arg_no] = *pptr;
/* plain pattern: sptr is compared with the current position below */
1645 arg_start[arg_no] = *pptr;
1646 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 1))
1648 if (sptr != arg_start[arg_no])
1650 arg_end[arg_no] = *pptr;
1655 spec->arg_no = arg_no;
1658 if (spec->tcl_interp)
1659 execTcl(spec, ap->u.code);
1661 execCode (spec, ap->u.code);
1663 execCode (spec, ap->u.code);
1666 if (spec->stop_flag)
1670 arg_start[arg_no] = *pptr;
1671 arg_end[arg_no] = F_WIN_EOF;
/* execRule: run the actions of rule ruleNo of context, looked up through
 * the context's fastRule table, and return what execAction returns. */
1680 static int execRule (struct lexSpec *spec, struct lexContext *context,
1681 int ruleNo, int start_ptr, int *pptr)
1684 yaz_log (YLOG_LOG, "exec rule %d", ruleNo);
1686 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: main scanning loop. Reads input from *ptr, drives the DFA of
 * the context on top of the context stack and, for each match, emits the
 * unmatched text before it as data (execDataP) and runs the matching rule
 * (execRule). The optional end callback f_win_ef is called with the
 * current offset once a rule has run and the input is not at F_WIN_EOF.
 * NOTE(review): the loop structure and return values are not visible in
 * this listing.
 */
1690 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1692 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1693 struct DFA_state *state = context->dfa->states[0];
/* the previous character starts out as a newline */
1696 unsigned char c_prev = '\n';
1698 int last_rule = 0; /* rule number of current match */
1699 int last_ptr = *ptr; /* last char of match */
1700 int start_ptr = *ptr; /* first char of match */
1701 int skip_ptr = *ptr; /* first char of run */
1705 c = f_win_advance (spec, ptr);
1706 if (*ptr == F_WIN_EOF)
1708 /* end of file met */
1711 /* there was a match */
1712 if (skip_ptr < start_ptr)
1714 /* deal with chars that didn't match */
1717 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1718 execDataP (spec, buf, size, 0);
1720 /* restore pointer */
1723 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1725 /* restore skip pointer */
1729 else if (skip_ptr < *ptr)
1731 /* deal with chars that didn't match */
1734 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1735 execDataP (spec, buf, size, 0);
1737 if (*ptr == F_WIN_EOF)
1744 { /* no transition for character c ... */
1747 if (skip_ptr < start_ptr)
1749 /* deal with chars that didn't match */
1752 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1753 execDataP (spec, buf, size, 0);
1755 /* restore pointer */
1757 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1759 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1762 yaz_log (YLOG_LOG, "regx: endf ptr=%d", *ptr);
1764 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* re-read the top of the context stack after a rule has run */
1768 context = spec->context_stack[spec->context_stack_top];
1771 last_ptr = start_ptr = *ptr;
1775 c_prev = f_win_advance (spec, &start_ptr);
1780 c_prev = f_win_advance (spec, &start_ptr);
1783 state = context->dfa->states[0];
1786 else if (c >= t->ch[0] && c <= t->ch[1])
1787 { /* transition ... */
1788 state = context->dfa->states[t->to];
1793 last_rule = state->rule_no;
1796 else if (state->rule_nno)
1798 last_rule = state->rule_nno;
/*
 * lexRoot: read one record using the context called context_name.
 * Resets the stop flag and the context stack, looks the context up by
 * name (warning when it cannot be found), runs its init and begin
 * actions, scans the input with lexNode, releases pending data on every
 * remaining level, runs the end actions and returns d1_stack[0].
 * NOTE(review): the use of offset and the conditions around the init and
 * begin actions are not visible in this listing.
 */
1810 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1811 const char *context_name)
1813 struct lexContext *lt = spec->context;
1816 spec->stop_flag = 0;
1818 spec->context_stack_top = 0;
1821 if (!strcmp (lt->name, context_name))
1827 yaz_log (YLOG_WARN, "cannot find context %s", context_name);
/* the chosen context becomes the bottom of the context stack */
1830 spec->context_stack[spec->context_stack_top] = lt;
1831 spec->d1_stack[spec->d1_level] = NULL;
1836 execAction (spec, lt->initActionList, ptr, &ptr);
1839 execAction (spec, lt->beginActionList, ptr, &ptr);
1840 lexNode (spec, &ptr);
1841 while (spec->d1_level)
1843 tagDataRelease (spec);
1846 execAction (spec, lt->endActionList, ptr, &ptr);
1847 return spec->d1_stack[0];
/* grs_destroy: filter destructor; clientData is the lexSpecs handle whose
 * lexer specification is destroyed here.
 * NOTE(review): freeing of the handle itself is not visible here. */
1850 void grs_destroy(void *clientData)
1852 struct lexSpecs *specs = (struct lexSpecs *) clientData;
1855 lexSpecDestroy(&specs->spec);
/* grs_init: filter constructor; allocates the lexSpecs handle and starts
 * with an empty type string.
 * NOTE(review): initialisation of specs->spec and the return statement are
 * not visible in this listing. */
1860 void *grs_init(Res res, RecType recType)
1862 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
1864 strcpy(specs->type, "");
/* grs_config: store the filter argument string as the spec type.
 * args is copied only when it fits into specs->type including the
 * terminator.
 * NOTE(review): the return values are not visible in this listing. */
1869 ZEBRA_RES grs_config(void *clientData, Res res, const char *args)
1871 struct lexSpecs *specs = (struct lexSpecs *) clientData;
1872 if (strlen(args) < sizeof(specs->type))
1873 strcpy(specs->type, args);
/*
 * grs_read_regx: read one record with the plain regx filter.
 * The lexer specification cached in the handle is rebuilt when there is
 * none yet or its name differs from the configured type; it is destroyed
 * again after readFileSpec (the condition is not visible here). The file
 * window is then reset and bound to the callbacks and handle in p, and
 * the record is read with lexRoot using context "main".
 */
1877 data1_node *grs_read_regx (struct grs_read_info *p)
1880 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
/* local alias for the cached spec; shadows the file-scope curLexSpec */
1881 struct lexSpec **curLexSpec = &specs->spec;
1884 yaz_log (YLOG_LOG, "grs_read_regx");
1886 if (!*curLexSpec || strcmp ((*curLexSpec)->name, specs->type))
1889 lexSpecDestroy (curLexSpec);
1890 *curLexSpec = lexSpecCreate (specs->type, p->dh);
1891 res = readFileSpec (*curLexSpec);
1894 lexSpecDestroy (curLexSpec);
1898 (*curLexSpec)->dh = p->dh;
/* start with an empty window and this record's I/O callbacks */
1901 (*curLexSpec)->f_win_start = 0;
1902 (*curLexSpec)->f_win_end = 0;
1903 (*curLexSpec)->f_win_rf = p->readf;
1904 (*curLexSpec)->f_win_sf = p->seekf;
1905 (*curLexSpec)->f_win_fh = p->fh;
1906 (*curLexSpec)->f_win_ef = p->endf;
1907 (*curLexSpec)->f_win_size = 500000;
1909 (*curLexSpec)->m = p->mem;
1910 return lexRoot (*curLexSpec, p->offset, "main");
/* extract_regx: extraction entry point; delegates to zebra_grs_extract
 * with grs_read_regx as the record reader. */
1913 static int extract_regx(void *clientData, struct recExtractCtrl *ctrl)
1915 return zebra_grs_extract(clientData, ctrl, grs_read_regx);
/* retrieve_regx: retrieval entry point; delegates to zebra_grs_retrieve
 * with grs_read_regx as the record reader. */
1918 static int retrieve_regx(void *clientData, struct recRetrieveCtrl *ctrl)
1920 return zebra_grs_retrieve(clientData, ctrl, grs_read_regx);
/* Record-type descriptor of the regx filter.
 * NOTE(review): the initializer is not visible in this listing. */
1923 static struct recType regx_type = {
/*
 * grs_read_tcl: read one record with the Tcl-enabled regx filter.
 * Works like grs_read_regx, but when the lexer specification is rebuilt a
 * Tcl interpreter is created and initialised and the commands "begin",
 * "end", "data" and "unread" are registered with the spec as their client
 * data before the spec file is read.
 */
1935 data1_node *grs_read_tcl (struct grs_read_info *p)
1938 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
/* local alias for the cached spec; shadows the file-scope curLexSpec */
1939 struct lexSpec **curLexSpec = &specs->spec;
1942 yaz_log (YLOG_LOG, "grs_read_tcl");
1944 if (!*curLexSpec || strcmp ((*curLexSpec)->name, specs->type))
1946 Tcl_Interp *tcl_interp;
1948 lexSpecDestroy (curLexSpec);
1949 *curLexSpec = lexSpecCreate (specs->type, p->dh);
1950 Tcl_FindExecutable("");
1951 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
/* NOTE(review): the result of Tcl_Init is not checked on this line */
1952 Tcl_Init(tcl_interp);
1953 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
1954 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
1955 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
1956 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
1958 res = readFileSpec (*curLexSpec);
1961 lexSpecDestroy (curLexSpec);
1965 (*curLexSpec)->dh = p->dh;
/* start with an empty window and this record's I/O callbacks */
1968 (*curLexSpec)->f_win_start = 0;
1969 (*curLexSpec)->f_win_end = 0;
1970 (*curLexSpec)->f_win_rf = p->readf;
1971 (*curLexSpec)->f_win_sf = p->seekf;
1972 (*curLexSpec)->f_win_fh = p->fh;
1973 (*curLexSpec)->f_win_ef = p->endf;
1974 (*curLexSpec)->f_win_size = 500000;
1976 (*curLexSpec)->m = p->mem;
1977 return lexRoot (*curLexSpec, p->offset, "main");
/* extract_tcl: extraction entry point; delegates to zebra_grs_extract
 * with grs_read_tcl as the record reader. */
1980 static int extract_tcl(void *clientData, struct recExtractCtrl *ctrl)
1982 return zebra_grs_extract(clientData, ctrl, grs_read_tcl);
/* retrieve_tcl: retrieval entry point; delegates to zebra_grs_retrieve
 * with grs_read_tcl as the record reader. */
1985 static int retrieve_tcl(void *clientData, struct recRetrieveCtrl *ctrl)
1987 return zebra_grs_retrieve(clientData, ctrl, grs_read_tcl);
/* Record-type descriptor of the Tcl-enabled regx filter.
 * NOTE(review): the initializer is not visible in this listing. */
1990 static struct recType tcl_type = {
2003 #ifdef IDZEBRA_STATIC_GRS_REGX
2004 idzebra_filter_grs_regx