2 * Copyright (C) 1994-1999, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.32 1999-09-07 07:19:21 adam
8 * Work on character mapping. Implemented replace rules.
10 * Revision 1.31 1999/07/14 13:05:29 adam
11 * Tcl filter works with objects when TCL is version 8 or later; filter
12 * works with strings otherwise (slow).
14 * Revision 1.30 1999/07/14 10:55:28 adam
17 * Revision 1.29 1999/07/12 07:27:54 adam
18 * Improved speed of Tcl processing. Fixed one memory leak.
20 * Revision 1.28 1999/07/06 12:26:04 adam
21 * Fixed filters so that MS-DOS CR is ignored.
23 * Revision 1.27 1999/06/28 13:25:40 quinn
24 * Improved diagnostics for Tcl
26 * Revision 1.26 1999/05/26 07:49:14 adam
29 * Revision 1.25 1999/05/25 12:33:32 adam
30 * Fixed bug in Tcl filter.
32 * Revision 1.24 1999/05/21 11:08:46 adam
33 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
34 * script so that it reads uninstalled Tcl source.
36 * Revision 1.23 1999/05/20 12:57:18 adam
37 * Implemented TCL filter. Updated recctrl system.
39 * Revision 1.22 1998/11/03 16:07:13 adam
42 * Revision 1.21 1998/11/03 15:43:39 adam
43 * Fixed bug introduced by previous commit.
45 * Revision 1.20 1998/11/03 14:51:28 adam
46 * Changed code so that it creates as few data1 nodes as possible.
48 * Revision 1.19 1998/11/03 10:22:39 adam
49 * Fixed memory leak that could occur when large data1 nodes were
50 * concatenated. Data-type data1_nodes may have multiple nodes.
52 * Revision 1.18 1998/10/15 13:11:47 adam
53 * Added support for option -record for "end element". When specified
54 * end element will mark end-of-record when at outer-level.
56 * Revision 1.17 1998/07/01 10:13:51 adam
59 * Revision 1.16 1998/06/30 15:15:09 adam
60 * Tags are trimmed: white space is removed before and after the tag.
62 * Revision 1.15 1998/06/30 12:55:45 adam
65 * Revision 1.14 1998/03/05 08:41:00 adam
66 * Implemented rule contexts.
68 * Revision 1.13 1997/12/12 06:33:58 adam
69 * Fixed bug that showed up when multiple filters were used.
70 * Made one routine thread-safe.
72 * Revision 1.12 1997/11/18 10:03:24 adam
73 * Member num_children removed from data1_node.
75 * Revision 1.11 1997/11/06 11:41:01 adam
76 * Implemented "begin variant" for the sgml.regx filter.
78 * Revision 1.10 1997/10/31 12:36:12 adam
79 * Minor change that avoids compiler warning.
81 * Revision 1.9 1997/09/29 09:02:49 adam
82 * Fixed small bug (introduced by previous commit).
84 * Revision 1.8 1997/09/17 12:19:22 adam
85 * Zebra version corresponds to YAZ version 1.4.
86 * Changed Zebra server so that it doesn't depend on global common_resource.
88 * Revision 1.7 1997/07/15 16:33:07 adam
89 * Check for zero length in execData.
91 * Revision 1.6 1997/02/24 10:41:51 adam
92 * Cleanup of code and commented out the "end element-end-record" code.
94 * Revision 1.5 1997/02/19 16:22:33 adam
95 * Fixed "end element" to terminate record in outer-most level.
97 * Revision 1.4 1997/02/12 20:42:58 adam
98 * Changed some log messages.
100 * Revision 1.3 1996/11/08 14:05:33 adam
101 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
103 * Revision 1.2 1996/10/29 14:02:09 adam
104 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
105 * data1_get_tabpath is used.
107 * Revision 1.1 1996/10/11 10:57:30 adam
108 * New module recctrl. Used to manage records (extract/retrieval).
110 * Revision 1.24 1996/06/17 14:25:31 adam
111 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
113 * Revision 1.23 1996/06/04 10:19:00 adam
114 * Minor changes - removed include of ctype.h.
116 * Revision 1.22 1996/06/03 15:23:13 adam
117 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
119 * Revision 1.21 1996/05/14 16:58:38 adam
122 * Revision 1.20 1996/05/01 13:46:36 adam
123 * First work on multiple records in one file.
124 * New option, -offset, to the "unread" command in the filter module.
126 * Revision 1.19 1996/02/12 16:18:20 adam
127 * Yet another bug fix in implementation of unread command.
129 * Revision 1.18 1996/02/12 16:07:54 adam
130 * Bug fix in new unread command.
132 * Revision 1.17 1996/02/12 15:56:11 adam
133 * New code command: unread.
135 * Revision 1.16 1996/01/17 14:57:51 adam
136 * Prototype changed for reader functions in extract/retrieve. File
137 * is identified by 'void *' instead of 'int'.
139 * Revision 1.15 1996/01/08 19:15:47 adam
140 * New input filter that works!
142 * Revision 1.14 1996/01/08 09:10:38 adam
143 * Yet another complete rework on this module.
145 * Revision 1.13 1995/12/15 17:21:50 adam
146 * This version is able to set data.formatted_text in data1-nodes.
148 * Revision 1.12 1995/12/15 16:20:10 adam
149 * The filter files (*.flt) are read from the path given by data1_tabpath.
151 * Revision 1.11 1995/12/15 12:35:16 adam
154 * Revision 1.10 1995/12/15 10:35:36 adam
157 * Revision 1.9 1995/12/14 16:38:48 adam
158 * Completely new attempt to make regular expression parsing.
160 * Revision 1.8 1995/12/13 17:16:59 adam
163 * Revision 1.7 1995/12/13 16:51:58 adam
164 * Modified to set last_child in data1_nodes.
165 * Uses destroy handler to free up data text nodes.
167 * Revision 1.6 1995/12/13 13:45:37 quinn
168 * Changed data1 to use nmem.
170 * Revision 1.5 1995/12/11 09:12:52 adam
171 * The rec_get function returns NULL if record doesn't exist - will
172 * happen in the server if the result set records have been deleted since
173 * the creation of the set (i.e. the search).
174 * The server saves a result temporarily if it is 'volatile', i.e. the
175 * set is register dependent.
177 * Revision 1.4 1995/12/05 16:57:40 adam
178 * More work on regular patterns.
180 * Revision 1.3 1995/12/05 09:37:09 adam
181 * One malloc was renamed to xmalloc.
183 * Revision 1.2 1995/12/04 17:59:24 adam
184 * More work on regular expression conversion.
186 * Revision 1.1 1995/12/04 14:25:30 adam
187 * Started work on regular expression parsed input to structured records.
196 #include <zebrautl.h>
203 #if MAJOR_VERSION >= 8
204 #define HAVE_TCL_OBJECTS
210 #define F_WIN_EOF 2000000000
214 #define REGX_PATTERN 1
219 #define REGX_CONTEXT 6
/* One entry in the action list of a rule. The members visible in this
 * excerpt are a compiled pattern (dfa, used for REGX_PATTERN actions), a
 * code fragment (code, used for REGX_CODE actions) and the link to the
 * next action. Further members are on lines not visible here. */
229 struct lexRuleAction {
233 struct DFA *dfa; /* REGX_PATTERN */
236 struct regxCode *code; /* REGX_CODE */
238 struct lexRuleAction *next;
243 struct lexRuleAction *actionList;
247 struct lexRuleInfo info;
248 struct lexRule *next;
254 struct lexRule *rules;
255 struct lexRuleInfo **fastRule;
259 struct lexRuleAction *beginActionList;
260 struct lexRuleAction *endActionList;
261 struct lexRuleAction *initActionList;
262 struct lexContext *next;
265 struct lexConcatBuf {
272 struct lexContext *context;
274 struct lexContext **context_stack;
275 int context_stack_size;
276 int context_stack_top;
282 Tcl_Interp *tcl_interp;
285 void (*f_win_ef)(void *, off_t);
287 int f_win_start; /* first byte of buffer is this file offset */
288 int f_win_end; /* last byte of buffer is this offset - 1 */
289 int f_win_size; /* size of buffer */
290 char *f_win_buf; /* buffer itself */
291 int (*f_win_rf)(void *, char *, size_t);
292 off_t (*f_win_sf)(void *, off_t);
294 struct lexConcatBuf *concatBuf;
296 data1_node **d1_stack;
307 struct lexSpec *spec;
/* f_win_get: return a pointer to the buffered bytes for the file range
 * start_pos .. end_pos and store the number of usable bytes through size.
 *
 * Three cases are visible:
 *  - the range already lies inside the window: a pointer into the existing
 *    buffer is returned and *size is end_pos - start_pos;
 *  - start_pos lies outside the window: the seek callback f_win_sf is
 *    invoked, the buffer is allocated on first use (f_win_size bytes) and
 *    refilled through the read callback f_win_rf;
 *  - otherwise the bytes from start_pos onwards are moved to the front of
 *    the buffer and the read callback is asked for f_win_size - i more.
 * In the refill cases *size is clamped to end_pos - start_pos, so the caller
 * may receive fewer bytes than requested.
 * NOTE(review): several argument and assignment lines are not visible in
 * this excerpt; confirm the details against the complete function. */
310 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
313 int i, r, off = start_pos - spec->f_win_start;
315 if (off >= 0 && end_pos <= spec->f_win_end)
317 *size = end_pos - start_pos;
318 return spec->f_win_buf + off;
320 if (off < 0 || start_pos >= spec->f_win_end)
322 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
323 spec->f_win_start = start_pos;
325 if (!spec->f_win_buf)
326 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
327 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
329 spec->f_win_end = spec->f_win_start + *size;
331 if (*size > end_pos - start_pos)
332 *size = end_pos - start_pos;
333 return spec->f_win_buf;
335 for (i = 0; i<spec->f_win_end - start_pos; i++)
336 spec->f_win_buf[i] = spec->f_win_buf[i + off];
337 r = (*spec->f_win_rf)(spec->f_win_fh,
339 spec->f_win_size - i);
340 spec->f_win_start = start_pos;
341 spec->f_win_end += r;
343 if (*size > end_pos - start_pos)
344 *size = end_pos - start_pos;
345 return spec->f_win_buf;
/* f_win_advance: fetch the byte at file offset *pos. When the offset lies
 * inside the buffered window the byte is returned and *pos is incremented.
 * Otherwise, after a test of *pos against F_WIN_EOF, a single byte at *pos
 * is requested through f_win_get. What happens at end of input is on lines
 * not visible in this excerpt. */
348 static int f_win_advance (struct lexSpec *spec, int *pos)
353 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
354 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
355 if (*pos == F_WIN_EOF)
357 buf = f_win_get (spec, *pos, *pos+1, &size);
/* regxCodeDel: dispose of the regxCode that *pp refers to. The only step
 * visible in this excerpt releases the reference held on the Tcl object
 * (Tcl_DecrRefCount); presumably the remaining storage is freed on lines
 * not shown here - confirm. */
367 static void regxCodeDel (struct regxCode **pp)
369 struct regxCode *p = *pp;
374 Tcl_DecrRefCount (p->tcl_obj);
/* regxCodeMk: build a regxCode from the len bytes at buf. The structure is
 * allocated with xmalloc, the bytes are copied into a new str buffer of
 * len+1 bytes, and a Tcl string object is created from the same bytes with
 * its reference count incremented. Termination of str and the store through
 * pp are on lines not visible in this excerpt. */
382 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
386 p = (struct regxCode *) xmalloc (sizeof(*p));
387 p->str = (char *) xmalloc (len+1);
388 memcpy (p->str, buf, len);
391 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
393 Tcl_IncrRefCount (p->tcl_obj);
/* lexSpecDFA: return a DFA prepared for parsing filter patterns. Space and
 * tab are removed from the parse character map and '/' is added with the
 * value 0. The creation of the DFA object itself is on a line not visible
 * in this excerpt. */
398 static struct DFA *lexSpecDFA (void)
403 dfa_parse_cmap_del (dfa, ' ');
404 dfa_parse_cmap_del (dfa, '\t');
405 dfa_parse_cmap_add (dfa, '/', 0);
/* actionListDel: walk the action list that starts at *rap. In the cases
 * visible here a pattern action has its DFA deleted (dfa_delete) and a code
 * action has its regxCode released (regxCodeDel). ra1 supplies the next
 * node for each loop step; presumably it is saved before the current node
 * is freed - confirm against the complete function. */
409 static void actionListDel (struct lexRuleAction **rap)
411 struct lexRuleAction *ra1, *ra;
413 for (ra = *rap; ra; ra = ra1)
419 dfa_delete (&ra->u.pattern.dfa);
422 regxCodeDel (&ra->u.code);
/* lexContextCreate: allocate a lexContext, store a private copy of name
 * (xstrdup), attach a fresh DFA obtained from lexSpecDFA and set the begin,
 * end and init action lists to NULL. Initialisation of the other members
 * and the return statement are not visible in this excerpt. */
430 static struct lexContext *lexContextCreate (const char *name)
432 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
434 p->name = xstrdup (name);
437 p->dfa = lexSpecDFA ();
440 p->beginActionList = NULL;
441 p->endActionList = NULL;
442 p->initActionList = NULL;
/* lexContextDestroy: tear down a lexContext. The context DFA is deleted,
 * the action list of every rule in p->rules is released, and finally the
 * begin, end and init action lists are released. rp1 supplies the next rule
 * for each loop step. Freeing of the rule nodes and of p itself is not
 * visible in this excerpt. */
447 static void lexContextDestroy (struct lexContext *p)
449 struct lexRule *rp, *rp1;
451 dfa_delete (&p->dfa);
453 for (rp = p->rules; rp; rp = rp1)
456 actionListDel (&rp->info.actionList);
459 actionListDel (&p->beginActionList);
460 actionListDel (&p->endActionList);
461 actionListDel (&p->initActionList);
/* lexSpecCreate: allocate a lexSpec for the filter called name. Visible
 * set-up steps: the name is copied into newly allocated storage, a context
 * stack with room for 100 entries is allocated, and the concatBuf and
 * d1_stack arrays are allocated with p->maxLevel entries, every concat
 * buffer starting out empty (max 0, buf 0). The use of dh, the value given
 * to maxLevel and the return statement are on lines not visible here. */
466 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
471 p = (struct lexSpec *) xmalloc (sizeof(*p));
472 p->name = (char *) xmalloc (strlen(name)+1);
473 strcpy (p->name, name);
480 p->context_stack_size = 100;
481 p->context_stack = (struct lexContext **)
482 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
486 p->concatBuf = (struct lexConcatBuf *)
487 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
488 for (i = 0; i < p->maxLevel; i++)
490 p->concatBuf[i].max = 0;
491 p->concatBuf[i].buf = 0;
493 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/* lexSpecDestroy: release a lexSpec. Visible steps: every per-level concat
 * buffer and the concatBuf array are freed, each context in the list is
 * destroyed (its next pointer is read before lexContextDestroy is called),
 * the Tcl interpreter is deleted, and the file window buffer and the
 * context stack are freed. Any guards around these calls and the freeing
 * of the structure itself are not visible in this excerpt. */
498 static void lexSpecDestroy (struct lexSpec **pp)
501 struct lexContext *lt;
509 for (i = 0; i < p->maxLevel; i++)
510 xfree (p->concatBuf[i].buf);
511 xfree (p->concatBuf);
516 struct lexContext *lt_next = lt->next;
517 lexContextDestroy (lt);
522 Tcl_DeleteInterp (p->tcl_interp);
525 xfree (p->f_win_buf);
526 xfree (p->context_stack);
/* readParseToken: scan the next token of a filter specification starting
 * at *cpp. Leading spaces, tabs, newlines and carriage returns are skipped.
 * For keywords the letters are collected in cmd (bounded by sizeof(cmd)-2)
 * with upper case folded to lower case, and the word is compared against
 * begin, end, body, context and init; any other word is logged as a bad
 * command. An unexpected character is logged and followed by a scan up to
 * the next white space character. The token codes returned and the value
 * stored through len are on lines not visible in this excerpt. */
532 static int readParseToken (const char **cpp, int *len)
534 const char *cp = *cpp;
538 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
567 if (*cp >= 'a' && *cp <= 'z')
569 else if (*cp >= 'A' && *cp <= 'Z')
570 cmd[i] = *cp + 'a' - 'A';
573 if (i < (int) sizeof(cmd)-2)
580 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
582 while (*cp && *cp != ' ' && *cp != '\t' &&
583 *cp != '\n' && *cp != '\r')
589 if (!strcmp (cmd, "begin"))
591 else if (!strcmp (cmd, "end"))
593 else if (!strcmp (cmd, "body"))
595 else if (!strcmp (cmd, "context"))
597 else if (!strcmp (cmd, "init"))
601 logf (LOG_WARN, "bad command %s", cmd);
/* actionListMk: parse the action text s into a chain of lexRuleAction
 * nodes, each allocated with xmalloc and stored through ap. Tokens are
 * obtained from readParseToken until it returns 0. Cases visible here:
 *  - a code token becomes a regxCode built from the len bytes at s;
 *  - a pattern is compiled into a fresh DFA with dfa_parse followed by
 *    dfa_mkstate, and records the current bodyMark; a parse problem is
 *    logged together with the text consumed so far;
 *  - BEGIN and INIT are rejected with a warning.
 * The return value and the handling of the remaining token kinds are on
 * lines not visible in this excerpt. */
607 static int actionListMk (struct lexSpec *spec, const char *s,
608 struct lexRuleAction **ap)
614 while ((tok = readParseToken (&s, &len)))
622 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
624 regxCodeMk (&(*ap)->u.code, s, len);
628 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
630 (*ap)->u.pattern.body = bodyMark;
632 (*ap)->u.pattern.dfa = lexSpecDFA ();
634 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
639 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
642 dfa_mkstate ((*ap)->u.pattern.dfa);
646 logf (LOG_WARN, "cannot use BEGIN here");
649 logf (LOG_WARN, "cannot use INIT here");
652 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/* readOneSpec: process one logical line s of the filter specification.
 *  - CONTEXT: the next token must be a code token holding the context name
 *    (otherwise a warning is logged); a new context with that name is
 *    created and linked in front of spec->context.
 *  - A context named "main" is created at one point; presumably this is the
 *    default when no context exists yet - confirm.
 *  - begin / end / init: the matching action list of the current context is
 *    deleted and rebuilt from the rest of the line.
 *  - Otherwise the line is a pattern rule: the pattern is parsed into the
 *    DFA of the current context (errors and a missing '/' are logged), a
 *    lexRule numbered with the ruleNo counter of the context is put at the
 *    head of the rule list, and its action list is parsed.
 * NOTE(review): context_name is a 32-byte array filled by memcpy of len
 * bytes; no bound check on len is visible in this excerpt - confirm that
 * one exists.
 * The return value is on lines not visible here. */
662 int readOneSpec (struct lexSpec *spec, const char *s)
666 struct lexContext *lc;
668 tok = readParseToken (&s, &len);
669 if (tok == REGX_CONTEXT)
671 char context_name[32];
672 tok = readParseToken (&s, &len);
673 if (tok != REGX_CODE)
675 logf (LOG_WARN, "missing name after CONTEXT keyword");
680 memcpy (context_name, s, len);
681 context_name[len] = '\0';
682 lc = lexContextCreate (context_name);
683 lc->next = spec->context;
688 spec->context = lexContextCreate ("main");
693 actionListDel (&spec->context->beginActionList);
694 actionListMk (spec, s, &spec->context->beginActionList);
697 actionListDel (&spec->context->endActionList);
698 actionListMk (spec, s, &spec->context->endActionList);
701 actionListDel (&spec->context->initActionList);
702 actionListMk (spec, s, &spec->context->initActionList);
706 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
708 r = dfa_parse (spec->context->dfa, &s);
711 logf (LOG_WARN, "regular expression error. r=%d", r);
716 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
720 rp = (struct lexRule *) xmalloc (sizeof(*rp));
721 rp->info.no = spec->context->ruleNo++;
722 rp->next = spec->context->rules;
723 spec->context->rules = rp;
724 actionListMk (spec, s, &rp->info.actionList);
/* readFileSpec: load the filter specification file for spec.
 * When a Tcl interpreter is present the file name <name>.tflt is formed;
 * the name <name>.flt is formed as well (the condition selecting it is not
 * visible here). Files are opened with yaz_path_fopen on the data1 table
 * path; a failure is logged with the filter name.
 * The file is then read character by character into the wrbuf lineBuf. A
 * line starting with '#', a newline, a blank, a tab or a carriage return is
 * treated specially and read to its end; other text is collected, the test
 * for a following blank or tab presumably detects continuation lines -
 * confirm. Each collected spec is NUL-terminated and handed to readOneSpec,
 * and spec->lineNo is advanced by addLine.
 * Finally, for every context, a fastRule table with ruleNo entries is
 * allocated, cleared and filled so that a rule number indexes its
 * lexRuleInfo, and the context DFA is finalised with dfa_mkstate.
 * The return value and the errors counter handling are not visible here. */
729 int readFileSpec (struct lexSpec *spec)
731 struct lexContext *lc;
732 int c, i, errors = 0;
738 if (spec->tcl_interp)
740 sprintf (fname, "%s.tflt", spec->name);
741 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
746 sprintf (fname, "%s.flt", spec->name);
747 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
751 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
754 logf (LOG_LOG, "reading regx filter %s", fname);
756 if (spec->tcl_interp)
757 logf (LOG_LOG, "Tcl enabled");
759 lineBuf = wrbuf_alloc();
764 wrbuf_rewind (lineBuf);
765 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
767 while (c != '\n' && c != EOF)
780 wrbuf_putc(lineBuf, c);
788 if (c != ' ' && c != '\t')
793 wrbuf_putc(lineBuf, '\0');
794 readOneSpec (spec, wrbuf_buf(lineBuf));
795 spec->lineNo += addLine;
799 wrbuf_free(lineBuf, 1);
804 debug_dfa_followpos = 1;
807 for (lc = spec->context; lc; lc = lc->next)
810 lc->fastRule = (struct lexRuleInfo **)
811 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
812 for (i = 0; i < lc->ruleNo; i++)
813 lc->fastRule[i] = NULL;
814 for (rp = lc->rules; rp; rp = rp->next)
815 lc->fastRule[rp->info.no] = &rp->info;
816 dfa_mkstate (lc->dfa);
/* File-scope pointer to a lexSpec, initially NULL. Its readers and writers
 * are not visible in this excerpt. */
825 static struct lexSpec *curLexSpec = NULL;
/* execData: append elen bytes of text from ebuf to the data node at the
 * current level of the data1 stack.
 *
 * If the node on top of the stack is already a data node, its current
 * length (org_len) is the offset at which the new bytes are added.
 * Otherwise a new DATA1N_data text node is created under the parent at
 * d1_level-1, linked after the previous sibling if there is one, and
 * stored at the current stack level.
 * The text itself is accumulated in the per-level concatBuf: when
 * org_len + elen reaches the buffer capacity, a larger buffer (needed size
 * plus 256 bytes) is allocated and the old contents copied over. The node
 * length is then increased by elen.
 *
 * Two ways of setting u.data.data on a new node are visible (copying ebuf
 * directly, or setting it to 0); presumably they are alternatives selected
 * by conditional compilation on lines not shown - confirm.
 * The tests on elen == 0 and d1_level <= 1 have their consequences on
 * lines not visible in this excerpt. */
828 static void execData (struct lexSpec *spec,
829 const char *ebuf, int elen, int formatted_text)
831 struct data1_node *res, *parent;
834 if (elen == 0) /* shouldn't happen, but it does! */
838 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
839 ebuf, 15, ebuf + elen-15);
841 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
843 logf (LOG_DEBUG, "data (%d bytes)", elen);
846 if (spec->d1_level <= 1)
849 parent = spec->d1_stack[spec->d1_level -1];
852 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
853 org_len = res->u.data.len;
858 res = data1_mk_node (spec->dh, spec->m);
859 res->parent = parent;
860 res->which = DATA1N_data;
861 res->u.data.what = DATA1I_text;
863 res->u.data.formatted_text = formatted_text;
865 if (elen > DATA1_LOCALDATA)
866 res->u.data.data = nmem_malloc (spec->m, elen);
868 res->u.data.data = res->lbuf;
869 memcpy (res->u.data.data, ebuf, elen);
871 res->u.data.data = 0;
873 res->root = parent->root;
875 parent->last_child = res;
876 if (spec->d1_stack[spec->d1_level])
877 spec->d1_stack[spec->d1_level]->next = res;
880 spec->d1_stack[spec->d1_level] = res;
882 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
884 char *old_buf, *new_buf;
886 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
887 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
888 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
890 memcpy (new_buf, old_buf, org_len);
893 spec->concatBuf[spec->d1_level].buf = new_buf;
895 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
896 res->u.data.len += elen;
/* execDataP: forwards its arguments unchanged to execData. Any additional
 * processing around that call is not visible in this excerpt. */
899 static void execDataP (struct lexSpec *spec,
900 const char *ebuf, int elen, int formatted_text)
902 execData (spec, ebuf, elen, formatted_text);
/* tagDataRelease: finalise the pending text node at the current stack
 * level. When that node is a DATA1N_data node holding DATA1I_text, its
 * data pointer must still be unset and its length positive (both asserted).
 * Storage is then chosen - nmem memory when the length exceeds
 * DATA1_LOCALDATA, otherwise the local buffer of the node - and the text
 * accumulated in the concatBuf of this level is copied into it. */
905 static void tagDataRelease (struct lexSpec *spec)
909 if ((res = spec->d1_stack[spec->d1_level]) &&
910 res->which == DATA1N_data &&
911 res->u.data.what == DATA1I_text)
913 assert (!res->u.data.data);
914 assert (res->u.data.len > 0);
915 if (res->u.data.len > DATA1_LOCALDATA)
916 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
918 res->u.data.data = res->lbuf;
919 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
/* variantBegin: open a variant node described by a class, a type and a
 * value, each passed as pointer plus length (not NUL-terminated).
 *
 * Visible steps:
 *  - a warning is logged when no record has been started (d1_level == 0);
 *  - class and type are copied into local buffers, truncated to
 *    DATA1_MAX_SYMBOL-1 characters, and the variant type is looked up with
 *    data1_getvartypebyct in the varset of the abstract syntax of the root;
 *  - when the parent is not itself a variant node, an enclosing variant
 *    node with type and value 0 is created, linked and pushed first;
 *  - the stack is scanned downwards over variant nodes for one with the
 *    same type (what happens on a hit is not visible here);
 *  - a variant node carrying the type and a copy of the value (truncated to
 *    DATA1_LOCALDATA-1 bytes, stored in the local buffer of the node) is
 *    created, linked after its previous sibling or as first child, and
 *    pushed on the stack.
 * Pending text at the current level is released with tagDataRelease before
 * a sibling is linked.
 * NOTE(review): parent is initialised from d1_stack[d1_level - 1] before
 * the d1_level == 0 test, which reads index -1 when no record is open -
 * confirm. */
924 static void variantBegin (struct lexSpec *spec,
925 const char *class_str, int class_len,
926 const char *type_str, int type_len,
927 const char *value_str, int value_len)
929 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
930 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
935 if (spec->d1_level == 0)
937 logf (LOG_WARN, "in variant begin. No record type defined");
940 if (class_len >= DATA1_MAX_SYMBOL)
941 class_len = DATA1_MAX_SYMBOL-1;
942 memcpy (tclass, class_str, class_len);
943 tclass[class_len] = '\0';
945 if (type_len >= DATA1_MAX_SYMBOL)
946 type_len = DATA1_MAX_SYMBOL-1;
947 memcpy (ttype, type_str, type_len);
948 ttype[type_len] = '\0';
951 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype,
956 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
960 if (parent->which != DATA1N_variant)
962 res = data1_mk_node (spec->dh, spec->m);
963 res->parent = parent;
964 res->which = DATA1N_variant;
965 res->u.variant.type = 0;
966 res->u.variant.value = 0;
967 res->root = parent->root;
969 parent->last_child = res;
970 if (spec->d1_stack[spec->d1_level])
972 tagDataRelease (spec);
973 spec->d1_stack[spec->d1_level]->next = res;
977 spec->d1_stack[spec->d1_level] = res;
978 spec->d1_stack[++(spec->d1_level)] = NULL;
980 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
981 if (spec->d1_stack[i]->u.variant.type == tp)
988 logf (LOG_DEBUG, "variant node (%d)", spec->d1_level);
990 parent = spec->d1_stack[spec->d1_level-1];
991 res = data1_mk_node (spec->dh, spec->m);
992 res->parent = parent;
993 res->which = DATA1N_variant;
994 res->root = parent->root;
995 res->u.variant.type = tp;
997 if (value_len >= DATA1_LOCALDATA)
998 value_len =DATA1_LOCALDATA-1;
999 memcpy (res->lbuf, value_str, value_len);
1000 res->lbuf[value_len] = '\0';
1002 res->u.variant.value = res->lbuf;
1004 parent->last_child = res;
1005 if (spec->d1_stack[spec->d1_level])
1007 tagDataRelease (spec);
1008 spec->d1_stack[spec->d1_level]->next = res;
1011 parent->child = res;
1012 spec->d1_stack[spec->d1_level] = res;
1013 spec->d1_stack[++(spec->d1_level)] = NULL;
/* tagStrip: trim white space (as classified by isspace) from both ends of
 * the tag given by *tag and *len. The first loop scans backwards from the
 * end, the second forwards from the start. The updates of *tag and *len
 * are on lines not visible in this excerpt. */
1016 static void tagStrip (const char **tag, int *len)
1020 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
1023 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/* tagBegin: open an element node named by the len bytes at tag.
 *
 * Visible steps:
 *  - a warning is logged when no record has been started (d1_level == 0);
 *  - surrounding white space is removed from the tag with tagStrip;
 *  - a DATA1N_tag node is created under the node at d1_level-1, with
 *    get_bytes set to -1; the tag name is copied into the local buffer of
 *    the node, or into nmem memory when len >= DATA1_LOCALDATA, and is
 *    NUL-terminated;
 *  - the element definition is looked up by tag name in the abstract
 *    syntax of the root node (d1_stack[0]); the parent tag found with
 *    get_parent_tag is consulted as well, in a way not fully visible here;
 *  - the node is linked after its previous sibling (releasing pending text
 *    first) or as first child, and pushed on the stack. */
1029 static void tagBegin (struct lexSpec *spec,
1030 const char *tag, int len)
1032 struct data1_node *parent;
1033 data1_element *elem = NULL;
1036 data1_element *e = NULL;
1039 if (spec->d1_level == 0)
1041 logf (LOG_WARN, "in element begin. No record type defined");
1044 tagStrip (&tag, &len);
1046 parent = spec->d1_stack[spec->d1_level -1];
1047 partag = get_parent_tag(spec->dh, parent);
1049 res = data1_mk_node (spec->dh, spec->m);
1050 res->parent = parent;
1051 res->which = DATA1N_tag;
1052 res->u.tag.get_bytes = -1;
1054 if (len >= DATA1_LOCALDATA)
1055 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
1057 res->u.tag.tag = res->lbuf;
1059 memcpy (res->u.tag.tag, tag, len);
1060 res->u.tag.tag[len] = '\0';
1063 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1065 if (parent->which == DATA1N_variant)
1068 if (!(e = partag->u.tag.element))
1071 elem = data1_getelementbytagname (spec->dh,
1072 spec->d1_stack[0]->u.root.absyn,
1074 res->u.tag.element = elem;
1075 res->u.tag.node_selected = 0;
1076 res->u.tag.make_variantlist = 0;
1077 res->u.tag.no_data_requested = 0;
1078 res->root = parent->root;
1080 parent->last_child = res;
1081 if (spec->d1_stack[spec->d1_level])
1083 tagDataRelease (spec);
1084 spec->d1_stack[spec->d1_level]->next = res;
1087 parent->child = res;
1088 spec->d1_stack[spec->d1_level] = res;
1089 spec->d1_stack[++(spec->d1_level)] = NULL;
/* tagEnd: close elements while the stack level is above min_level. The tag
 * is first trimmed with tagStrip. In each step pending text at the current
 * level is released; the loop then tests for level 0 and for a tag node
 * whose name has the same length and bytes as tag. The level adjustment
 * and the exact exit conditions (including how a NULL tag is treated) are
 * on lines not visible in this excerpt. */
1092 static void tagEnd (struct lexSpec *spec, int min_level,
1093 const char *tag, int len)
1095 tagStrip (&tag, &len);
1096 while (spec->d1_level > min_level)
1098 tagDataRelease (spec);
1100 if (spec->d1_level == 0)
1102 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1104 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1106 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1110 logf (LOG_DEBUG, "end tag (%d)", spec->d1_level);
/* tryMatch: run dfa over the input starting at offset *pptr, beginning in
 * state 0 and reading bytes with f_win_advance. A transition is taken when
 * the byte falls in the range ch[0]..ch[1] of a transition entry; the rule
 * number of an accepting state is remembered in last_rule (taken from
 * rule_no or rule_nno - the choice presumably depends on the previous
 * character; confirm). On a match *mptr receives the start offset of the
 * match and *pptr the offset just past its end, and the automaton is reset
 * to state 0. The end-of-input handling and the return values are on lines
 * not visible in this excerpt. */
1115 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1118 struct DFA_state *state = dfa->states[0];
1121 unsigned char c_prev = 0;
1122 int ptr = *pptr; /* current pointer */
1123 int start_ptr = *pptr; /* first char of match */
1124 int last_ptr = 0; /* last char of match */
1125 int last_rule = 0; /* rule number of current match */
1130 c = f_win_advance (spec, &ptr);
1131 if (ptr == F_WIN_EOF)
1148 *mptr = start_ptr; /* match starts here */
1149 *pptr = last_ptr; /* match end here (+1) */
1152 state = dfa->states[0];
1157 else if (c >= t->ch[0] && c <= t->ch[1])
1159 state = dfa->states[t->to];
1164 last_rule = state->rule_no;
1169 last_rule = state->rule_nno;
/* execTok: extract the next token from the code string at *src and report
 * it through tokBuf and tokLen. Leading blanks and tabs are skipped. Token
 * forms visible here:
 *  - $N (N decimal): refers to match argument N; the text is fetched from
 *    the file window with f_win_get using arg_start[N] and arg_end[N];
 *    spec->arg_no == 0 and N >= spec->arg_no are tested first;
 *  - a double-quoted string, whose length runs up to the closing quote;
 *  - a newline or ';';
 *  - other words, delimited by white space (and further characters on
 *    lines not visible here).
 * The numeric codes returned are not visible in this excerpt; callers in
 * this file compare the result against 3. */
1181 static int execTok (struct lexSpec *spec, const char **src,
1182 const char **tokBuf, int *tokLen)
1184 const char *s = *src;
1186 while (*s == ' ' || *s == '\t')
1190 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1194 while (*s >= '0' && *s <= '9')
1195 n = n*10 + (*s++ -'0');
1196 if (spec->arg_no == 0)
1203 if (n >= spec->arg_no)
1205 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1209 else if (*s == '\"')
1212 while (*s && *s != '\"')
1214 *tokLen = s - *tokBuf;
1219 else if (*s == '\n' || *s == ';')
1227 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1230 *tokLen = s - *tokBuf;
1237 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1240 *tokLen = s - *tokBuf;
/* regxStrz: copy len bytes from src into the caller-supplied buffer str.
 * Presumably the copy is NUL-terminated and str is returned - the lines
 * doing so, and any limit applied to len, are not visible in this excerpt;
 * confirm. */
1246 static char *regxStrz (const char *src, int len, char *str)
1250 memcpy (str, src, len);
/* cmd_tcl_begin: Tcl command callback; clientData is the lexSpec. argv[1]
 * selects the action:
 *  - record (argc 3): looks up the abstract syntax named argv[2] (an
 *    unknown name is logged), creates a DATA1N_root node referring to it
 *    and pushes the node on the data1 stack;
 *  - element (argc 3): opens the element argv[2] with tagBegin;
 *  - variant (argc 5): opens a variant with class argv[2], type argv[3] and
 *    value argv[4];
 *  - context (argc 3): searches the context list for the name argv[2] and
 *    pushes the context on the context stack, or logs an unknown context.
 * NOTE(review): the root node stores the argv[2] pointer itself as its
 * type name; confirm that its lifetime is sufficient.
 * The return value is on lines not visible in this excerpt. */
1256 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1257 int argc, char **argv)
1259 struct lexSpec *spec = (struct lexSpec *) clientData;
1262 if (!strcmp(argv[1], "record") && argc == 3)
1264 char *absynName = argv[2];
1268 logf (LOG_DEBUG, "begin record %s", absynName);
1270 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1271 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1276 res = data1_mk_node (spec->dh, spec->m);
1277 res->which = DATA1N_root;
1278 res->u.root.type = absynName;
1279 res->u.root.absyn = absyn;
1282 spec->d1_stack[spec->d1_level] = res;
1283 spec->d1_stack[++(spec->d1_level)] = NULL;
1286 else if (!strcmp(argv[1], "element") && argc == 3)
1288 tagBegin (spec, argv[2], strlen(argv[2]));
1290 else if (!strcmp (argv[1], "variant") && argc == 5)
1292 variantBegin (spec, argv[2], strlen(argv[2]),
1293 argv[3], strlen(argv[3]),
1294 argv[4], strlen(argv[4]));
1296 else if (!strcmp (argv[1], "context") && argc == 3)
1298 struct lexContext *lc = spec->context;
1300 logf (LOG_DEBUG, "begin context %s",argv[2]);
1302 while (lc && strcmp (argv[2], lc->name))
1306 spec->context_stack[++(spec->context_stack_top)] = lc;
1309 logf (LOG_WARN, "unknown context %s", argv[2]);
/* cmd_tcl_end: Tcl command callback; clientData is the lexSpec. argv[1]
 * selects the action:
 *  - record: releases pending text while levels remain on the data1 stack
 *    and sets spec->stop_flag;
 *  - element: an optional -record argument is recognised, the element is
 *    closed with tagEnd, and when the stack level reaches 0 the stop flag
 *    is set;
 *  - context: pops the context stack unless it is already at entry 0.
 * The derivation of min_level and element, and the return value, are on
 * lines not visible in this excerpt. */
1316 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1317 int argc, char **argv)
1319 struct lexSpec *spec = (struct lexSpec *) clientData;
1323 if (!strcmp (argv[1], "record"))
1325 while (spec->d1_level)
1327 tagDataRelease (spec);
1331 logf (LOG_DEBUG, "end record");
1333 spec->stop_flag = 1;
1335 else if (!strcmp (argv[1], "element"))
1339 if (argc >= 3 && !strcmp(argv[2], "-record"))
1348 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1349 if (spec->d1_level == 0)
1352 logf (LOG_DEBUG, "end element end records");
1354 spec->stop_flag = 1;
1357 else if (!strcmp (argv[1], "context"))
1360 logf (LOG_DEBUG, "end context");
1362 if (spec->context_stack_top)
1363 (spec->context_stack_top)--;
/* cmd_tcl_data: Tcl command callback that emits text into the record;
 * clientData is the lexSpec. The options -text and -element <name> are
 * recognised. The visible calls open the named element with tagBegin, add
 * the text argv[argi] with execData, and close elements down to level 1
 * with tagEnd; presumably the tagBegin and tagEnd calls only apply when an
 * element name was given - confirm. The option loop and the return value
 * are on lines not visible in this excerpt. */
1370 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1371 int argc, char **argv)
1375 const char *element = 0;
1376 struct lexSpec *spec = (struct lexSpec *) clientData;
1380 if (!strcmp("-text", argv[argi]))
1385 else if (!strcmp("-element", argv[argi]))
1389 element = argv[argi++];
1395 tagBegin (spec, element, strlen(element));
1399 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1403 tagEnd (spec, 1, NULL, 0);
/* cmd_tcl_unread: Tcl command callback that moves the read position back
 * to a match argument; clientData is the lexSpec. An optional -offset
 * argument supplies an offset (parsed with atoi). The argument index no is
 * parsed with atoi and limited to spec->arg_no - 1; spec->ptr is then set
 * to the start of that argument plus the offset.
 * NOTE(review): only an upper limit on no is visible; a negative index
 * would not be caught by the visible lines - confirm. */
1407 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1408 int argc, char **argv)
1410 struct lexSpec *spec = (struct lexSpec *) clientData;
1417 if (!strcmp("-offset", argv[argi]))
1422 offset = atoi(argv[argi]);
1431 no = atoi(argv[argi]);
1432 if (no >= spec->arg_no)
1433 no = spec->arg_no - 1;
1434 spec->ptr = spec->arg_start[no] + offset;
/* execTcl: run a code fragment through the Tcl interpreter of spec.
 * For every match argument i a Tcl variable named by the decimal value of
 * i is set to the matched text: the text is obtained from the file window
 * with f_win_get, NUL-terminated in place for the Tcl_SetVar call, and the
 * overwritten byte is restored afterwards.
 * The code is then evaluated at global level, either as a Tcl object
 * (Tcl_GlobalEvalObj) or as a string (Tcl_GlobalEval). On failure the
 * error line, the interpreter result and the errorInfo variable are logged
 * at LOG_FATAL.
 * NOTE(review): the selection uses "#if HAVE_TCL_OBJECTS" while the macro
 * is defined without a value earlier in this file; "#ifdef" may have been
 * intended - confirm. */
1438 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1442 for (i = 0; i < spec->arg_no; i++)
1444 char var_name[10], *var_buf;
1447 sprintf (var_name, "%d", i);
1448 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
1452 ch = var_buf[var_len];
1453 var_buf[var_len] = '\0';
1454 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1455 var_buf[var_len] = ch;
1458 #if HAVE_TCL_OBJECTS
1459 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1461 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1465 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1466 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1467 spec->tcl_interp->errorLine,
1468 spec->tcl_interp->result,
1469 err ? err : "[NO ERRORINFO]");
/* execCode: interpret the built-in (non-Tcl) command language held in
 * code->str. Tokens are read with execTok and keywords are compared after
 * being copied into ptmp with regxStrz.
 *
 * Commands visible in this excerpt:
 *  - begin record <absyn>: at stack level 0, looks up the abstract syntax
 *    (an unknown name is logged), creates a DATA1N_root node and pushes it.
 *    The name is kept in a function-static buffer of 64 bytes that the root
 *    node points at.
 *  - begin element <tag>: opens an element with tagBegin.
 *  - begin variant <class> <type> <value>: opens a variant.
 *  - begin context <name>: pushes the named context on the context stack;
 *    an unknown name is logged.
 *  - end record: releases pending text on every level and sets stop_flag.
 *  - end element [-record] [<tag>]: closes elements with tagEnd; when the
 *    stack level reaches 0 the stop flag is set.
 *  - end context: pops the context stack unless it is at entry 0.
 *  - data [-text] [-element <tag>] <text>: adds text with execData,
 *    optionally wrapped in the named element.
 *  - unread [-offset <n>] <index>: sets spec->ptr to the start of match
 *    argument <index> (a single digit, limited to arg_no - 1) plus the
 *    offset.
 *  - context <name>: replaces the entry on top of the context stack with
 *    the named context; an unknown name is logged.
 * Unknown commands, bad keywords and stray tokens are logged as warnings.
 * NOTE(review): the bound check on cmd_len before the copy into absynName
 * is not visible in this excerpt - confirm that one exists. */
1475 static void execCode (struct lexSpec *spec, struct regxCode *code)
1477 const char *s = code->str;
1479 const char *cmd_str;
1481 r = execTok (spec, &s, &cmd_str, &cmd_len);
1488 r = execTok (spec, &s, &cmd_str, &cmd_len);
1491 p = regxStrz (cmd_str, cmd_len, ptmp);
1492 if (!strcmp (p, "begin"))
1494 r = execTok (spec, &s, &cmd_str, &cmd_len);
1497 logf (LOG_WARN, "missing keyword after 'begin'");
1500 p = regxStrz (cmd_str, cmd_len, ptmp);
1501 if (!strcmp (p, "record"))
1503 r = execTok (spec, &s, &cmd_str, &cmd_len);
1506 if (spec->d1_level == 0)
1508 static char absynName[64];
1513 memcpy (absynName, cmd_str, cmd_len);
1514 absynName[cmd_len] = '\0';
1517 logf (LOG_DEBUG, "begin record %s", absynName);
1519 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1520 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1525 res = data1_mk_node (spec->dh, spec->m);
1526 res->which = DATA1N_root;
1527 res->u.root.type = absynName;
1528 res->u.root.absyn = absyn;
1531 spec->d1_stack[spec->d1_level] = res;
1532 spec->d1_stack[++(spec->d1_level)] = NULL;
1535 r = execTok (spec, &s, &cmd_str, &cmd_len);
1537 else if (!strcmp (p, "element"))
1539 r = execTok (spec, &s, &cmd_str, &cmd_len);
1542 tagBegin (spec, cmd_str, cmd_len);
1543 r = execTok (spec, &s, &cmd_str, &cmd_len);
1545 else if (!strcmp (p, "variant"))
1548 const char *class_str = NULL;
1550 const char *type_str = NULL;
1552 const char *value_str = NULL;
1553 r = execTok (spec, &s, &cmd_str, &cmd_len);
1556 class_str = cmd_str;
1557 class_len = cmd_len;
1558 r = execTok (spec, &s, &cmd_str, &cmd_len);
1564 r = execTok (spec, &s, &cmd_str, &cmd_len);
1567 value_str = cmd_str;
1568 value_len = cmd_len;
1570 variantBegin (spec, class_str, class_len,
1571 type_str, type_len, value_str, value_len);
1574 r = execTok (spec, &s, &cmd_str, &cmd_len);
1576 else if (!strcmp (p, "context"))
1580 struct lexContext *lc = spec->context;
1581 r = execTok (spec, &s, &cmd_str, &cmd_len);
1582 p = regxStrz (cmd_str, cmd_len, ptmp);
1584 logf (LOG_DEBUG, "begin context %s", p);
1586 while (lc && strcmp (p, lc->name))
1589 spec->context_stack[++(spec->context_stack_top)] = lc;
1591 logf (LOG_WARN, "unknown context %s", p);
1594 r = execTok (spec, &s, &cmd_str, &cmd_len);
1598 logf (LOG_WARN, "bad keyword '%s' after begin", p);
1601 else if (!strcmp (p, "end"))
1603 r = execTok (spec, &s, &cmd_str, &cmd_len);
1606 logf (LOG_WARN, "missing keyword after 'end'");
1609 p = regxStrz (cmd_str, cmd_len, ptmp);
1610 if (!strcmp (p, "record"))
1612 while (spec->d1_level)
1614 tagDataRelease (spec);
1617 r = execTok (spec, &s, &cmd_str, &cmd_len);
1619 logf (LOG_DEBUG, "end record");
1621 spec->stop_flag = 1;
1623 else if (!strcmp (p, "element"))
1626 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1628 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1633 tagEnd (spec, min_level, cmd_str, cmd_len);
1634 r = execTok (spec, &s, &cmd_str, &cmd_len);
1637 tagEnd (spec, min_level, NULL, 0);
1638 if (spec->d1_level == 0)
1641 logf (LOG_DEBUG, "end element end records");
1643 spec->stop_flag = 1;
1647 else if (!strcmp (p, "context"))
1650 logf (LOG_DEBUG, "end context");
1652 if (spec->context_stack_top)
1653 (spec->context_stack_top)--;
1654 r = execTok (spec, &s, &cmd_str, &cmd_len);
1657 logf (LOG_WARN, "bad keyword '%s' after end", p);
1659 else if (!strcmp (p, "data"))
1663 const char *element_str = NULL;
1665 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1667 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1669 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1671 r = execTok (spec, &s, &element_str, &element_len);
1676 logf (LOG_WARN, "bad data option: %.*s",
1681 logf (LOG_WARN, "missing data item after data");
1685 tagBegin (spec, element_str, element_len);
1688 execData (spec, cmd_str, cmd_len,textFlag);
1689 r = execTok (spec, &s, &cmd_str, &cmd_len);
1692 tagEnd (spec, 1, NULL, 0);
1694 else if (!strcmp (p, "unread"))
1697 r = execTok (spec, &s, &cmd_str, &cmd_len);
1698 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1700 r = execTok (spec, &s, &cmd_str, &cmd_len);
1703 logf (LOG_WARN, "missing number after -offset");
1706 p = regxStrz (cmd_str, cmd_len, ptmp);
1708 r = execTok (spec, &s, &cmd_str, &cmd_len);
1714 logf (LOG_WARN, "missing index after unread command");
1717 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1719 logf (LOG_WARN, "bad index after unread command");
1724 no = *cmd_str - '0';
1725 if (no >= spec->arg_no)
1726 no = spec->arg_no - 1;
1727 spec->ptr = spec->arg_start[no] + offset;
1729 r = execTok (spec, &s, &cmd_str, &cmd_len);
1731 else if (!strcmp (p, "context"))
1735 struct lexContext *lc = spec->context;
1736 r = execTok (spec, &s, &cmd_str, &cmd_len);
1737 p = regxStrz (cmd_str, cmd_len, ptmp);
1739 while (lc && strcmp (p, lc->name))
1742 spec->context_stack[spec->context_stack_top] = lc;
1744 logf (LOG_WARN, "unknown context %s", p);
1747 r = execTok (spec, &s, &cmd_str, &cmd_len);
1751 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1752 r = execTok (spec, &s, &cmd_str, &cmd_len);
1757 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1759 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* execAction: execute the action list ap for a match that started at
 * start_ptr, with *pptr as the current input position.
 *
 * Match arguments are recorded as offset pairs in the local arrays
 * arg_start and arg_end, which are published through spec->arg_start and
 * spec->arg_end; argument 0 starts at start_ptr.
 *  - Pattern action with the body flag: the input is searched with
 *    tryMatch from *pptr. The text before the match and the match itself
 *    are recorded as arguments; when nothing matches, F_WIN_EOF is used as
 *    the limit.
 *  - Pattern action without the body flag: tryMatch is run from *pptr and
 *    the start of the match is compared with the current position,
 *    presumably to require a match at that exact position - confirm.
 *  - Code action: spec->arg_no is set, then the code runs through execTcl
 *    when a Tcl interpreter is present and through execCode otherwise;
 *    spec->stop_flag is tested afterwards.
 * The return values and the argument counting are on lines not visible in
 * this excerpt. */
1766 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1767 int start_ptr, int *pptr)
1776 arg_start[0] = start_ptr;
1778 spec->arg_start = arg_start;
1779 spec->arg_end = arg_end;
1786 if (ap->u.pattern.body)
1788 arg_start[arg_no] = *pptr;
1789 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1791 arg_end[arg_no] = F_WIN_EOF;
1793 arg_start[arg_no] = F_WIN_EOF;
1794 arg_end[arg_no] = F_WIN_EOF;
1799 arg_end[arg_no] = sptr;
1801 arg_start[arg_no] = sptr;
1802 arg_end[arg_no] = *pptr;
1807 arg_start[arg_no] = *pptr;
1808 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1810 if (sptr != arg_start[arg_no])
1812 arg_end[arg_no] = *pptr;
1817 spec->arg_no = arg_no;
1820 if (spec->tcl_interp)
1821 execTcl(spec, ap->u.code);
1823 execCode (spec, ap->u.code);
1825 execCode (spec, ap->u.code);
1828 if (spec->stop_flag)
1832 arg_start[arg_no] = *pptr;
1833 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: log the rule number at debug level and hand the action list
 * of context->fastRule[ruleNo] to execAction, returning its result.
 * NOTE(review): the remaining arguments of the execAction call are not
 * visible here.
 */
1842 static int execRule (struct lexSpec *spec, struct lexContext *context,
1843 int ruleNo, int start_ptr, int *pptr)
1846 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1848 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: scan input starting at *ptr with the DFA of the context on
 * top of spec->context_stack. Text that no rule matched is passed to
 * execDataP; when a rule has matched, its actions are run via execRule.
 * NOTE(review): loop heads, branch heads and the return statements are
 * not visible here, so the returned data1_node is not described.
 */
1852 data1_node *lexNode (struct lexSpec *spec, int *ptr)
/* start in state 0 of the current context's DFA */
1854 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1855 struct DFA_state *state = context->dfa->states[0];
/* previous character is initialised to newline */
1858 unsigned char c_prev = '\n';
1860 int last_rule = 0; /* rule number of current match */
1861 int last_ptr = *ptr; /* last char of match */
1862 int start_ptr = *ptr; /* first char of match */
1863 int skip_ptr = *ptr; /* first char of run */
/* fetch the next character; *ptr becomes F_WIN_EOF at end of input */
1867 c = f_win_advance (spec, ptr);
1868 if (*ptr == F_WIN_EOF)
1870 /* end of file met */
1873 /* there was a match */
1874 if (skip_ptr < start_ptr)
1876 /* deal with chars that didn't match */
1879 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1880 execDataP (spec, buf, size, 0);
1882 /* restore pointer */
1885 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1887 /* restore skip pointer */
1891 else if (skip_ptr < *ptr)
1893 /* deal with chars that didn't match */
1896 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1897 execDataP (spec, buf, size, 0);
1899 if (*ptr == F_WIN_EOF)
1906 { /* no transition for character c ... */
1909 if (skip_ptr < start_ptr)
1911 /* deal with chars that didn't match */
1914 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1915 execDataP (spec, buf, size, 0);
1917 /* restore pointer */
1919 if (!execRule (spec, context, last_rule, start_ptr, ptr))
/* call the optional end callback with the current position */
1921 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1924 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1926 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* re-read the context: the stack top is read again after the rule ran */
1930 context = spec->context_stack[spec->context_stack_top];
/* the next match attempt starts at the current position */
1933 last_ptr = start_ptr = *ptr;
/* step start_ptr forward one character, remembering it as c_prev */
1937 c_prev = f_win_advance (spec, &start_ptr);
1942 c_prev = f_win_advance (spec, &start_ptr);
/* reset the DFA to its initial state */
1945 state = context->dfa->states[0];
1948 else if (c >= t->ch[0] && c <= t->ch[1])
1949 { /* transition ... */
1950 state = context->dfa->states[t->to];
/* record the rule number carried by the new state */
1955 last_rule = state->rule_no;
/* otherwise fall back to the state's rule_nno when it is non-zero */
1958 else if (state->rule_nno)
1960 last_rule = state->rule_nno;
/*
 * lexRoot: find the context called context_name among spec's contexts,
 * reset the per-record state, run the context's init and begin actions,
 * scan the input with lexNode, release any tags still open, run the end
 * actions and return spec->d1_stack[0].
 * NOTE(review): the lookup loop, the declaration of `ptr` and the use of
 * `offset` are not visible here.
 */
1972 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1973 const char *context_name)
/* lookup starts at spec->context */
1975 struct lexContext *lt = spec->context;
/* clear the stop flag and reset the context stack */
1978 spec->stop_flag = 0;
1980 spec->context_stack_top = 0;
1983 if (!strcmp (lt->name, context_name))
1989 logf (LOG_WARN, "cannot find context %s", context_name);
/* the located context becomes the current one; the current d1 slot is cleared */
1992 spec->context_stack[spec->context_stack_top] = lt;
1993 spec->d1_stack[spec->d1_level] = NULL;
1998 execAction (spec, lt->initActionList, ptr, &ptr);
2001 execAction (spec, lt->beginActionList, ptr, &ptr);
2002 lexNode (spec, &ptr);
/* release tag data for as long as d1_level is non-zero */
2003 while (spec->d1_level)
2005 tagDataRelease (spec);
2008 execAction (spec, lt->endActionList, ptr, &ptr);
2009 return spec->d1_stack[0];
/*
 * grs_destroy: destroy the lexSpec held in the lexSpecs object passed as
 * clientData.
 * NOTE(review): the rest of the body is not visible here.
 */
2012 void grs_destroy(void *clientData)
2014 struct lexSpecs *specs = (struct lexSpecs *) clientData;
2017 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocate a lexSpecs object with xmalloc.
 * NOTE(review): its initialisation and the return statement are not
 * visible here.
 */
2022 void *grs_init(void)
2024 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/*
 * grs_read_regx: read one record with the regx filter. The lexSpec kept
 * in p->clientData is reused when its name equals p->type; otherwise it
 * is destroyed, recreated and loaded with readFileSpec. The input window
 * is then reset and bound to the caller's callbacks and file handle, and
 * scanning starts in the "main" context at p->offset.
 * NOTE(review): the branch structure around readFileSpec is only partly
 * visible here.
 */
2029 data1_node *grs_read_regx (struct grs_read_info *p)
2032 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2033 struct lexSpec **curLexSpec = &specs->spec;
2036 logf (LOG_DEBUG, "grs_read_regx");
/* no spec yet, or the existing one belongs to another filter type */
2038 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2041 lexSpecDestroy (curLexSpec);
2042 *curLexSpec = lexSpecCreate (p->type, p->dh);
2043 res = readFileSpec (*curLexSpec);
/* presumably reached when readFileSpec reports failure - confirm */
2046 lexSpecDestroy (curLexSpec);
2050 (*curLexSpec)->dh = p->dh;
/* empty window; read/seek/end callbacks and handle come from the caller */
2053 (*curLexSpec)->f_win_start = 0;
2054 (*curLexSpec)->f_win_end = 0;
2055 (*curLexSpec)->f_win_rf = p->readf;
2056 (*curLexSpec)->f_win_sf = p->seekf;
2057 (*curLexSpec)->f_win_fh = p->fh;
2058 (*curLexSpec)->f_win_ef = p->endf;
/* window size; unit not shown here, presumably bytes - confirm */
2059 (*curLexSpec)->f_win_size = 500000;
2061 (*curLexSpec)->m = p->mem;
2062 return lexRoot (*curLexSpec, p->offset, "main");
/*
 * Record type descriptor for the regx filter.
 * NOTE(review): the initializer contents are not visible here.
 */
2065 static struct recTypeGrs regx_type = {
/* Handle for the regx record type; takes the address of regx_type. */
2072 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: read one record with the Tcl filter. The lexSpec kept in
 * p->clientData is reused when its name equals p->type; otherwise it is
 * destroyed and recreated with a fresh Tcl interpreter on which the
 * "begin", "end", "data" and "unread" commands are registered, and the
 * filter specification is loaded with readFileSpec. The input window is
 * then reset and bound to the caller's callbacks and file handle, and
 * scanning starts in the "main" context at p->offset.
 * NOTE(review): the branch structure around readFileSpec and the final
 * arguments of the "unread" registration are not visible here.
 */
2075 data1_node *grs_read_tcl (struct grs_read_info *p)
2078 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2079 struct lexSpec **curLexSpec = &specs->spec;
2082 logf (LOG_DEBUG, "grs_read_tcl");
/* no spec yet, or the existing one belongs to another filter type */
2084 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2086 Tcl_Interp *tcl_interp;
2088 lexSpecDestroy (curLexSpec);
2089 *curLexSpec = lexSpecCreate (p->type, p->dh);
/* the interpreter is stored in the spec; each command gets the spec as client data */
2090 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
2091 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2092 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2093 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
2094 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2096 res = readFileSpec (*curLexSpec);
/* presumably reached when readFileSpec reports failure - confirm */
2099 lexSpecDestroy (curLexSpec);
2103 (*curLexSpec)->dh = p->dh;
/* empty window; read/seek/end callbacks and handle come from the caller */
2106 (*curLexSpec)->f_win_start = 0;
2107 (*curLexSpec)->f_win_end = 0;
2108 (*curLexSpec)->f_win_rf = p->readf;
2109 (*curLexSpec)->f_win_sf = p->seekf;
2110 (*curLexSpec)->f_win_fh = p->fh;
2111 (*curLexSpec)->f_win_ef = p->endf;
/* window size; unit not shown here, presumably bytes - confirm */
2112 (*curLexSpec)->f_win_size = 500000;
2114 (*curLexSpec)->m = p->mem;
2115 return lexRoot (*curLexSpec, p->offset, "main");
/*
 * Record type descriptor for the Tcl filter.
 * NOTE(review): the initializer contents are not visible here.
 */
2118 static struct recTypeGrs tcl_type = {
/* Handle for the Tcl record type; takes the address of tcl_type. */
2125 RecTypeGrs recTypeGrs_tcl = &tcl_type;