2 * Copyright (C) 1994-1999, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.25 1999-05-25 12:33:32 adam
8 * Fixed bug in Tcl filter.
10 * Revision 1.24 1999/05/21 11:08:46 adam
11 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
12 * script so that it reads uninstalled Tcl source.
14 * Revision 1.23 1999/05/20 12:57:18 adam
15 * Implemented TCL filter. Updated recctrl system.
17 * Revision 1.22 1998/11/03 16:07:13 adam
20 * Revision 1.21 1998/11/03 15:43:39 adam
21 * Fixed bug introduced by previous commit.
23 * Revision 1.20 1998/11/03 14:51:28 adam
24 * Changed code so that it creates as few data1 nodes as possible.
26 * Revision 1.19 1998/11/03 10:22:39 adam
27 * Fixed memory leak that could occur when large data1 nodes were
28 * concatenated. Data-type data1_nodes may have multiple nodes.
30 * Revision 1.18 1998/10/15 13:11:47 adam
31 * Added support for option -record for "end element". When specified
32 * end element will mark end-of-record when at outer-level.
34 * Revision 1.17 1998/07/01 10:13:51 adam
37 * Revision 1.16 1998/06/30 15:15:09 adam
38 * Tags are trimmed: white space removed before and after the tag.
40 * Revision 1.15 1998/06/30 12:55:45 adam
43 * Revision 1.14 1998/03/05 08:41:00 adam
44 * Implemented rule contexts.
46 * Revision 1.13 1997/12/12 06:33:58 adam
47 * Fixed bug that showed up when multiple filters were used.
48 * Made one routine thread-safe.
50 * Revision 1.12 1997/11/18 10:03:24 adam
51 * Member num_children removed from data1_node.
53 * Revision 1.11 1997/11/06 11:41:01 adam
54 * Implemented "begin variant" for the sgml.regx filter.
56 * Revision 1.10 1997/10/31 12:36:12 adam
57 * Minor change that avoids compiler warning.
59 * Revision 1.9 1997/09/29 09:02:49 adam
60 * Fixed small bug (introduced by previous commit).
62 * Revision 1.8 1997/09/17 12:19:22 adam
63 * Zebra version corresponds to YAZ version 1.4.
64 * Changed Zebra server so that it doesn't depend on global common_resource.
66 * Revision 1.7 1997/07/15 16:33:07 adam
67 * Check for zero length in execData.
69 * Revision 1.6 1997/02/24 10:41:51 adam
70 * Cleanup of code and commented out the "end element-end-record" code.
72 * Revision 1.5 1997/02/19 16:22:33 adam
73 * Fixed "end element" to terminate record in outer-most level.
75 * Revision 1.4 1997/02/12 20:42:58 adam
76 * Changed some log messages.
78 * Revision 1.3 1996/11/08 14:05:33 adam
79 * Bug fix: data1 node member u.tag.get_bytes weren't initialized.
81 * Revision 1.2 1996/10/29 14:02:09 adam
82 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
83 * data1_get_tabpath is used.
85 * Revision 1.1 1996/10/11 10:57:30 adam
86 * New module recctrl. Used to manage records (extract/retrieval).
88 * Revision 1.24 1996/06/17 14:25:31 adam
89 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
91 * Revision 1.23 1996/06/04 10:19:00 adam
92 * Minor changes - removed include of ctype.h.
94 * Revision 1.22 1996/06/03 15:23:13 adam
95 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
97 * Revision 1.21 1996/05/14 16:58:38 adam
100 * Revision 1.20 1996/05/01 13:46:36 adam
101 * First work on multiple records in one file.
102 * New option, -offset, to the "unread" command in the filter module.
104 * Revision 1.19 1996/02/12 16:18:20 adam
105 * Yet another bug fix in implementation of unread command.
107 * Revision 1.18 1996/02/12 16:07:54 adam
108 * Bug fix in new unread command.
110 * Revision 1.17 1996/02/12 15:56:11 adam
111 * New code command: unread.
113 * Revision 1.16 1996/01/17 14:57:51 adam
114 * Prototype changed for reader functions in extract/retrieve. File
115 * is identified by 'void *' instead of 'int'.
117 * Revision 1.15 1996/01/08 19:15:47 adam
118 * New input filter that works!
120 * Revision 1.14 1996/01/08 09:10:38 adam
121 * Yet another complete rework on this module.
123 * Revision 1.13 1995/12/15 17:21:50 adam
124 * This version is able to set data.formatted_text in data1-nodes.
126 * Revision 1.12 1995/12/15 16:20:10 adam
127 * The filter files (*.flt) are read from the path given by data1_tabpath.
129 * Revision 1.11 1995/12/15 12:35:16 adam
132 * Revision 1.10 1995/12/15 10:35:36 adam
135 * Revision 1.9 1995/12/14 16:38:48 adam
136 * Completely new attempt to make regular expression parsing.
138 * Revision 1.8 1995/12/13 17:16:59 adam
141 * Revision 1.7 1995/12/13 16:51:58 adam
142 * Modified to set last_child in data1_nodes.
143 * Uses destroy handler to free up data text nodes.
145 * Revision 1.6 1995/12/13 13:45:37 quinn
146 * Changed data1 to use nmem.
148 * Revision 1.5 1995/12/11 09:12:52 adam
149 * The rec_get function returns NULL if record doesn't exist - will
150 * happen in the server if the result set records have been deleted since
151 * the creation of the set (i.e. the search).
152 * The server saves a result temporarily if it is 'volatile', i.e. the
153 * set is register dependent.
155 * Revision 1.4 1995/12/05 16:57:40 adam
156 * More work on regular patterns.
158 * Revision 1.3 1995/12/05 09:37:09 adam
159 * One malloc was renamed to xmalloc.
161 * Revision 1.2 1995/12/04 17:59:24 adam
162 * More work on regular expression conversion.
164 * Revision 1.1 1995/12/04 14:25:30 adam
165 * Started work on regular expression parsed input to structured records.
174 #include <zebrautl.h>
184 #define F_WIN_EOF 2000000000
188 #define REGX_PATTERN 1
193 #define REGX_CONTEXT 6
200 struct lexRuleAction {
204 struct DFA *dfa; /* REGX_PATTERN */
207 struct regxCode *code; /* REGX_CODE */
209 struct lexRuleAction *next;
214 struct lexRuleAction *actionList;
218 struct lexRuleInfo info;
219 struct lexRule *next;
225 struct lexRule *rules;
226 struct lexRuleInfo **fastRule;
230 struct lexRuleAction *beginActionList;
231 struct lexRuleAction *endActionList;
232 struct lexRuleAction *initActionList;
233 struct lexContext *next;
236 struct lexConcatBuf {
244 struct lexContext *context;
246 struct lexContext **context_stack;
247 int context_stack_size;
248 int context_stack_top;
254 Tcl_Interp *tcl_interp;
257 void (*f_win_ef)(void *, off_t);
259 int f_win_start; /* first byte of buffer is this file offset */
260 int f_win_end; /* last byte of buffer is this offset - 1 */
261 int f_win_size; /* size of buffer */
262 char *f_win_buf; /* buffer itself */
263 int (*f_win_rf)(void *, char *, size_t);
264 off_t (*f_win_sf)(void *, off_t);
266 struct lexConcatBuf **concatBuf;
268 data1_node **d1_stack;
279 struct lexSpec *spec;
/*
 * f_win_get: return a pointer to buffered file data for the byte range
 * starting at start_pos and ending before end_pos, and store the number
 * of usable bytes through size.
 *
 * Three paths are visible below:
 *  1. the range already lies inside the current window;
 *  2. start_pos lies outside the window, so the seek callback (f_win_sf)
 *     repositions the file and the read callback (f_win_rf) refills the
 *     buffer from start_pos;
 *  3. otherwise the bytes from start_pos to the window end are moved to the
 *     front of the buffer and the read callback is invoked again.
 *
 * NOTE(review): parts of this function (including the rest of the parameter
 * list) are not visible in this excerpt.
 */
282 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
285 int i, r, off = start_pos - spec->f_win_start;
/* Path 1: request is fully covered by the window. */
287 if (off >= 0 && end_pos <= spec->f_win_end)
289 *size = end_pos - start_pos;
290 return spec->f_win_buf + off;
/* Path 2: start is before the window or at/after its end: seek and refill. */
292 if (off < 0 || start_pos >= spec->f_win_end)
294 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
295 spec->f_win_start = start_pos;
/* The buffer is allocated lazily on first use. */
297 if (!spec->f_win_buf)
298 spec->f_win_buf = xmalloc (spec->f_win_size);
299 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
301 spec->f_win_end = spec->f_win_start + *size;
/* Never report more bytes than the caller asked for. */
303 if (*size > end_pos - start_pos)
304 *size = end_pos - start_pos;
305 return spec->f_win_buf;
/* Path 3: slide the still-needed tail of the window to the buffer start. */
307 for (i = 0; i<spec->f_win_end - start_pos; i++)
308 spec->f_win_buf[i] = spec->f_win_buf[i + off];
309 r = (*spec->f_win_rf)(spec->f_win_fh,
311 spec->f_win_size - i);
312 spec->f_win_start = start_pos;
313 spec->f_win_end += r;
315 if (*size > end_pos - start_pos)
316 *size = end_pos - start_pos;
317 return spec->f_win_buf;
/*
 * f_win_advance: read the character at file offset *pos.
 * When *pos lies inside the current window the byte is returned directly
 * from the buffer and *pos is incremented. Otherwise f_win_get is asked for
 * the single byte at *pos. What happens when *pos equals F_WIN_EOF, or after
 * the f_win_get call, is on lines not visible in this excerpt.
 */
320 static int f_win_advance (struct lexSpec *spec, int *pos)
325 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
326 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
327 if (*pos == F_WIN_EOF)
329 buf = f_win_get (spec, *pos, *pos+1, &size);
/*
 * regxCodeDel: operates on the regxCode object referenced by *pp.
 * NOTE(review): only the fetch of *pp is visible in this excerpt; by its
 * name and its use in actionListDel this presumably frees the object -
 * confirm against the full source.
 */
339 static void regxCodeDel (struct regxCode **pp)
341 struct regxCode *p = *pp;
/*
 * regxCodeMk: build a regxCode object from the len bytes at buf.
 * A regxCode is allocated with xmalloc, its str member gets len+1 bytes and
 * the first len bytes are copied from buf. Termination of the string and
 * the store into *pp are presumably on lines not visible here - confirm.
 */
350 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
354 p = xmalloc (sizeof(*p));
355 p->str = xmalloc (len+1);
356 memcpy (p->str, buf, len);
/*
 * lexSpecDFA: prepare a DFA object for parsing filter patterns.
 * The visible calls adjust the DFA parser's character map: the entries for
 * space and tab are deleted and an entry for '/' with value 0 is added.
 * NOTE(review): presumably this lets '/' end a pattern; creation and return
 * of the DFA are on lines not shown - confirm.
 */
361 static struct DFA *lexSpecDFA (void)
366 dfa_parse_cmap_del (dfa, ' ');
367 dfa_parse_cmap_del (dfa, '\t');
368 dfa_parse_cmap_add (dfa, '/', 0);
/*
 * actionListDel: walk the action list starting at *rap and release the
 * per-action payload. The visible cleanup calls are dfa_delete for the
 * pattern DFA and regxCodeDel for the code object; the selection between
 * them and the advance to the next element (ra1) are on lines not visible
 * in this excerpt.
 */
372 static void actionListDel (struct lexRuleAction **rap)
374 struct lexRuleAction *ra1, *ra;
376 for (ra = *rap; ra; ra = ra1)
382 dfa_delete (&ra->u.pattern.dfa);
385 regxCodeDel (&ra->u.code);
/*
 * lexContextCreate: allocate a lexContext named name.
 * The name is duplicated with xstrdup, a pattern DFA is obtained from
 * lexSpecDFA, and the begin/end/init action lists start out empty.
 * Initialisation of other members is on lines not visible here.
 */
393 static struct lexContext *lexContextCreate (const char *name)
395 struct lexContext *p = xmalloc (sizeof(*p));
397 p->name = xstrdup (name);
400 p->dfa = lexSpecDFA ();
403 p->beginActionList = NULL;
404 p->endActionList = NULL;
405 p->initActionList = NULL;
/*
 * lexContextDestroy: release a lexContext.
 * Visible steps: every rule's action list is deleted, then the context's
 * begin and end action lists.
 * NOTE(review): no actionListDel call for initActionList is visible here,
 * although lexContextCreate and readOneSpec set it; confirm whether it is
 * released on a line not shown, otherwise it leaks.
 */
410 static void lexContextDestroy (struct lexContext *p)
412 struct lexRule *rp, *rp1;
415 for (rp = p->rules; rp; rp = rp1)
418 actionListDel (&rp->info.actionList);
421 actionListDel (&p->beginActionList);
422 actionListDel (&p->endActionList);
/*
 * lexSpecCreate: allocate a lexSpec for the filter called name.
 * Visible steps: the name is copied into freshly allocated storage, a
 * context stack with room for 100 entries is allocated, and per-level
 * arrays sized by maxLevel are created: concatBuf (each entry an empty
 * lexConcatBuf with len = max = 0 and buf = 0) and d1_stack.
 * The value of maxLevel and the use of dh are on lines not visible here.
 */
427 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
432 p = xmalloc (sizeof(*p));
433 p->name = xmalloc (strlen(name)+1);
434 strcpy (p->name, name);
441 p->context_stack_size = 100;
442 p->context_stack = xmalloc (sizeof(*p->context_stack) *
443 p->context_stack_size);
447 p->concatBuf = xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
448 for (i = 0; i < p->maxLevel; i++)
450 p->concatBuf[i] = xmalloc (sizeof(**p->concatBuf));
451 p->concatBuf[i]->len = p->concatBuf[i]->max = 0;
452 p->concatBuf[i]->buf = 0;
454 p->d1_stack = xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/*
 * lexSpecDestroy: tear down the lexSpec referenced by *pp.
 * Visible steps: free each concatBuf entry and the concatBuf array, destroy
 * every context in the list, delete the Tcl interpreter, and free the file
 * window buffer and the context stack.
 * NOTE(review): the loop below frees the lexConcatBuf structs only; their
 * buf members (allocated with xmalloc in execData) are not freed on any
 * visible line - confirm whether that happens elsewhere.
 */
459 static void lexSpecDestroy (struct lexSpec **pp)
462 struct lexContext *lt;
470 for (i = 0; i < p->maxLevel; i++)
471 xfree (p->concatBuf[i]);
472 xfree (p->concatBuf);
/* Save the successor before the context is destroyed. */
477 struct lexContext *lt_next = lt->next;
478 lexContextDestroy (lt);
483 Tcl_DeleteInterp (p->tcl_interp);
486 xfree (p->f_win_buf);
487 xfree (p->context_stack);
/*
 * readParseToken: scan the next token of a filter specification, starting
 * at *cpp.
 * Visible steps: skip blanks, tabs and newlines; collect a keyword into
 * cmd, folding upper-case letters to lower case; log "bad character" for
 * unexpected input and skip to the next white space; finally compare the
 * keyword with begin / end / body / context / init and log "bad command"
 * when none matches.
 * The token codes returned and the updates of *cpp and *len are on lines
 * not visible in this excerpt.
 */
493 static int readParseToken (const char **cpp, int *len)
495 const char *cp = *cpp;
499 while (*cp == ' ' || *cp == '\t' || *cp == '\n')
528 if (*cp >= 'a' && *cp <= 'z')
530 else if (*cp >= 'A' && *cp <= 'Z')
/* Fold to lower case so keyword matching is case-insensitive. */
531 cmd[i] = *cp + 'a' - 'A';
/* Guard on the keyword buffer size. */
534 if (i < sizeof(cmd)-2)
541 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
543 while (*cp && *cp != ' ' && *cp != '\t' && *cp != '\n')
549 if (!strcmp (cmd, "begin"))
551 else if (!strcmp (cmd, "end"))
553 else if (!strcmp (cmd, "body"))
555 else if (!strcmp (cmd, "context"))
557 else if (!strcmp (cmd, "init"))
561 logf (LOG_WARN, "bad command %s", cmd);
/*
 * actionListMk: parse the action part of a rule from s and build the
 * action list at *ap.
 * Tokens are fetched with readParseToken until it returns 0. Visible
 * branches: a code action (regxCodeMk on the token text); a pattern action
 * (fresh DFA from lexSpecDFA, dfa_parse on the text, then dfa_mkstate),
 * which also records the bodyMark flag; and warnings for BEGIN and INIT,
 * which are not allowed inside an action list.
 * The case labels selecting each branch, and the linking of list elements,
 * are on lines not visible in this excerpt.
 */
567 static int actionListMk (struct lexSpec *spec, const char *s,
568 struct lexRuleAction **ap)
574 while ((tok = readParseToken (&s, &len)))
582 *ap = xmalloc (sizeof(**ap));
584 regxCodeMk (&(*ap)->u.code, s, len);
588 *ap = xmalloc (sizeof(**ap));
590 (*ap)->u.pattern.body = bodyMark;
592 (*ap)->u.pattern.dfa = lexSpecDFA ();
594 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
599 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
602 dfa_mkstate ((*ap)->u.pattern.dfa);
606 logf (LOG_WARN, "cannot use BEGIN here");
609 logf (LOG_WARN, "cannot use INIT here");
612 *ap = xmalloc (sizeof(**ap));
/*
 * readOneSpec: parse one entry of a filter specification held in s.
 *
 * Visible behaviour:
 *  - A leading CONTEXT token must be followed by a name token; a new
 *    context with that name is created and linked in front of the spec's
 *    context list.
 *  - A context named "main" is created on a path whose condition is not
 *    visible (presumably when no context exists yet - confirm).
 *  - For begin / end / init entries the corresponding action list of the
 *    current context is deleted and rebuilt with actionListMk.
 *  - Otherwise the entry is a pattern rule: the pattern is parsed into the
 *    context's DFA, a lexRule with the next rule number is pushed onto the
 *    context's rule list, and its actions are built with actionListMk.
 *
 * NOTE(review): context_name holds 32 bytes; any clamp of len before the
 * memcpy is not visible in this excerpt - confirm it exists.
 */
622 int readOneSpec (struct lexSpec *spec, const char *s)
626 struct lexContext *lc;
628 tok = readParseToken (&s, &len);
629 if (tok == REGX_CONTEXT)
631 char context_name[32];
632 tok = readParseToken (&s, &len);
633 if (tok != REGX_CODE)
635 logf (LOG_WARN, "missing name after CONTEXT keyword");
640 memcpy (context_name, s, len);
641 context_name[len] = '\0';
642 lc = lexContextCreate (context_name);
643 lc->next = spec->context;
648 spec->context = lexContextCreate ("main");
/* begin / end / init: replace the existing action list of the context. */
653 actionListDel (&spec->context->beginActionList);
654 actionListMk (spec, s, &spec->context->beginActionList);
657 actionListDel (&spec->context->endActionList);
658 actionListMk (spec, s, &spec->context->endActionList);
661 actionListDel (&spec->context->initActionList);
662 actionListMk (spec, s, &spec->context->initActionList);
666 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
668 r = dfa_parse (spec->context->dfa, &s);
671 logf (LOG_WARN, "regular expression error. r=%d", r);
676 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* Register the new rule at the head of the context's rule list. */
680 rp = xmalloc (sizeof(*rp));
681 rp->info.no = spec->context->ruleNo++;
682 rp->next = spec->context->rules;
683 spec->context->rules = rp;
684 actionListMk (spec, s, &rp->info.actionList);
/*
 * readFileSpec: load the filter specification file for spec.
 *
 * Visible behaviour:
 *  - The file name is built from spec->name: "<name>.tflt" when a Tcl
 *    interpreter is attached, and "<name>.flt" on a path whose condition is
 *    not visible here. Files are opened through yaz_path_fopen using the
 *    data1 table path.
 *  - The file is read character by character; '#', newline, blank and tab
 *    at the start of a line are handled specially (comment / continuation
 *    handling is only partly visible), and each assembled entry is passed
 *    to readOneSpec.
 *  - Afterwards, for every context, a fastRule table indexed by rule number
 *    is built and dfa_mkstate is called on the context's DFA.
 *
 * NOTE(review): the sprintf calls write spec->name plus a suffix into
 * lineBuf (1+lineSize bytes); the value of lineSize is not visible, so the
 * bound cannot be confirmed from this excerpt.
 */
689 int readFileSpec (struct lexSpec *spec)
691 struct lexContext *lc;
694 int c, i, errors = 0;
697 lineBuf = xmalloc (1+lineSize);
699 if (spec->tcl_interp)
701 sprintf (lineBuf, "%s.tflt", spec->name);
702 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), lineBuf, "r");
707 sprintf (lineBuf, "%s.flt", spec->name);
708 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), lineBuf, "r");
712 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
716 logf (LOG_LOG, "reading regx filter %s", lineBuf);
718 if (spec->tcl_interp)
719 logf (LOG_LOG, "Tcl enabled");
726 if (c == '#' || c == '\n' || c == ' ' || c == '\t')
728 while (c != '\n' && c != EOF)
747 if (c != ' ' && c != '\t')
756 readOneSpec (spec, lineBuf);
757 spec->lineNo += addLine;
766 debug_dfa_followpos = 1;
/* Build, per context, a direct rule-number -> rule-info lookup table. */
769 for (lc = spec->context; lc; lc = lc->next)
772 lc->fastRule = xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
773 for (i = 0; i < lc->ruleNo; i++)
774 lc->fastRule[i] = NULL;
775 for (rp = lc->rules; rp; rp = rp->next)
776 lc->fastRule[rp->info.no] = &rp->info;
777 dfa_mkstate (lc->dfa);
/*
 * File-scope pointer to a lexSpec, initially NULL.
 * NOTE(review): grs_read_regx and grs_read_tcl each declare a local variable
 * with this same name (pointing into their client data), which shadows this
 * one; confirm whether this file-scope variable is still used.
 */
786 static struct lexSpec *curLexSpec = NULL;
/*
 * execData: add elen bytes of text (ebuf) to the record tree at the
 * current level.
 *
 * Visible behaviour:
 *  - Zero-length data is checked for first, and a debug line describing the
 *    data is logged.
 *  - The parent is the node one level up (d1_stack[d1_level-1]).
 *  - If the node currently at this level is already a data node, its length
 *    is taken as org_len so the new text is appended to it; otherwise a new
 *    DATA1N_data / DATA1I_text node is created, linked as the parent's last
 *    child and after the previous sibling, and stored in d1_stack.
 *  - The text itself is accumulated in concatBuf[d1_level], which is
 *    regrown to org_len + elen + 256 bytes when too small (old contents
 *    copied over); the node's u.data.len is increased by elen.
 *
 * NOTE(review): lines 826-832 set u.data.data in several conflicting ways;
 * they are presumably alternatives under preprocessor conditionals that are
 * not visible in this excerpt - confirm.
 */
789 static void execData (struct lexSpec *spec,
790 const char *ebuf, int elen, int formatted_text)
792 struct data1_node *res, *parent;
795 if (elen == 0) /* shouldn't happen, but it does! */
799 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
800 ebuf, 15, ebuf + elen-15);
802 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
804 logf (LOG_DEBUG, "data (%d bytes)", elen);
807 if (spec->d1_level <= 1)
810 parent = spec->d1_stack[spec->d1_level -1];
/* Existing data node at this level: append to it rather than create one. */
813 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
814 org_len = res->u.data.len;
819 res = data1_mk_node (spec->dh, spec->m);
820 res->parent = parent;
821 res->which = DATA1N_data;
822 res->u.data.what = DATA1I_text;
824 res->u.data.formatted_text = formatted_text;
826 if (elen > DATA1_LOCALDATA)
827 res->u.data.data = nmem_malloc (spec->m, elen);
829 res->u.data.data = res->lbuf;
830 memcpy (res->u.data.data, ebuf, elen);
832 res->u.data.data = 0;
834 res->root = parent->root;
836 parent->last_child = res;
837 if (spec->d1_stack[spec->d1_level])
838 spec->d1_stack[spec->d1_level]->next = res;
841 spec->d1_stack[spec->d1_level] = res;
/* Grow the per-level concatenation buffer when the new text will not fit. */
843 if (org_len + elen >= spec->concatBuf[spec->d1_level]->max)
845 char *old_buf, *new_buf;
847 spec->concatBuf[spec->d1_level]->max = org_len + elen + 256;
848 new_buf = xmalloc (spec->concatBuf[spec->d1_level]->max);
849 if ((old_buf = spec->concatBuf[spec->d1_level]->buf))
851 memcpy (new_buf, old_buf, org_len);
854 spec->concatBuf[spec->d1_level]->buf = new_buf;
856 assert (spec->concatBuf[spec->d1_level]);
857 memcpy (spec->concatBuf[spec->d1_level]->buf + org_len, ebuf, elen);
858 res->u.data.len += elen;
/*
 * execDataP: forwards its arguments unchanged to execData on the visible
 * line. Any additional processing is on lines not shown in this excerpt.
 */
861 static void execDataP (struct lexSpec *spec,
862 const char *ebuf, int elen, int formatted_text)
864 execData (spec, ebuf, elen, formatted_text);
/*
 * tagDataRelease: finalise the text data node at the current level, if any.
 * When d1_stack[d1_level] is a DATA1N_data node of kind DATA1I_text, its
 * data pointer (asserted to be unset, with a positive length) is given
 * storage - nmem_malloc when the length exceeds DATA1_LOCALDATA, the node's
 * own lbuf otherwise - and the text accumulated in concatBuf[d1_level] is
 * copied into it.
 */
867 static void tagDataRelease (struct lexSpec *spec)
871 if ((res = spec->d1_stack[spec->d1_level]) &&
872 res->which == DATA1N_data &&
873 res->u.data.what == DATA1I_text)
875 assert (!res->u.data.data);
876 assert (res->u.data.len > 0);
877 if (res->u.data.len > DATA1_LOCALDATA)
878 res->u.data.data = nmem_malloc (spec->m, res->u.data.len);
880 res->u.data.data = res->lbuf;
881 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level]->buf,
/*
 * variantBegin: open a variant node (class / type / value) at the current
 * level of the record tree.
 *
 * Visible behaviour:
 *  - Refuses to run when no record has been started (d1_level == 0).
 *  - Class and type names are copied into local buffers, truncated to
 *    DATA1_MAX_SYMBOL-1 characters, and looked up with
 *    data1_getvartypebyct in the varset of the record's abstract syntax.
 *  - If the parent is not already a variant node, an empty "variant group"
 *    node (type and value 0) is created first and pushed on the stack.
 *  - The enclosing variant nodes are scanned for one with the same type.
 *  - A variant node carrying the type and the value (truncated to
 *    DATA1_LOCALDATA-1 characters, stored in lbuf) is created, linked as
 *    last child / next sibling, and pushed on the stack.
 *
 * NOTE(review): the initialiser of parent reads d1_stack[d1_level - 1]
 * before the d1_level == 0 test below, i.e. index -1 when no record is
 * open - confirm and consider moving the read after the check.
 */
886 static void variantBegin (struct lexSpec *spec,
887 const char *class_str, int class_len,
888 const char *type_str, int type_len,
889 const char *value_str, int value_len)
891 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
892 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
897 if (spec->d1_level == 0)
899 logf (LOG_WARN, "in variant begin. No record type defined");
/* Clamp and NUL-terminate the class name. */
902 if (class_len >= DATA1_MAX_SYMBOL)
903 class_len = DATA1_MAX_SYMBOL-1;
904 memcpy (tclass, class_str, class_len);
905 tclass[class_len] = '\0';
/* Clamp and NUL-terminate the type name. */
907 if (type_len >= DATA1_MAX_SYMBOL)
908 type_len = DATA1_MAX_SYMBOL-1;
909 memcpy (ttype, type_str, type_len);
910 ttype[type_len] = '\0';
913 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype,
918 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* Parent is not a variant: insert a grouping variant node first. */
922 if (parent->which != DATA1N_variant)
924 res = data1_mk_node (spec->dh, spec->m);
925 res->parent = parent;
926 res->which = DATA1N_variant;
927 res->u.variant.type = 0;
928 res->u.variant.value = 0;
929 res->root = parent->root;
931 parent->last_child = res;
932 if (spec->d1_stack[spec->d1_level])
934 tagDataRelease (spec);
935 spec->d1_stack[spec->d1_level]->next = res;
939 spec->d1_stack[spec->d1_level] = res;
940 spec->d1_stack[++(spec->d1_level)] = NULL;
/* Look upwards through enclosing variant nodes for the same type. */
942 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
943 if (spec->d1_stack[i]->u.variant.type == tp)
950 logf (LOG_DEBUG, "variant node (%d)", spec->d1_level);
952 parent = spec->d1_stack[spec->d1_level-1];
953 res = data1_mk_node (spec->dh, spec->m);
954 res->parent = parent;
955 res->which = DATA1N_variant;
956 res->root = parent->root;
957 res->u.variant.type = tp;
/* The value is stored in the node's local buffer, truncated if needed. */
959 if (value_len >= DATA1_LOCALDATA)
960 value_len =DATA1_LOCALDATA-1;
961 memcpy (res->lbuf, value_str, value_len);
962 res->lbuf[value_len] = '\0';
964 res->u.variant.value = res->lbuf;
966 parent->last_child = res;
967 if (spec->d1_stack[spec->d1_level])
969 tagDataRelease (spec);
970 spec->d1_stack[spec->d1_level]->next = res;
974 spec->d1_stack[spec->d1_level] = res;
975 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagStrip: trim white space around a tag given as pointer + length.
 * The first loop scans backwards over trailing white space, the second
 * scans forwards over leading white space; the updates of *tag and *len
 * are on lines not visible in this excerpt.
 * NOTE(review): isspace is called on plain char values; confirm behaviour
 * for bytes >= 0x80 where char is signed.
 */
978 static void tagStrip (const char **tag, int *len)
982 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
985 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/*
 * tagBegin: open an element (tag) node named by tag/len at the current
 * level of the record tree.
 *
 * Visible behaviour:
 *  - Refuses to run when no record has been started (d1_level == 0).
 *  - The tag name is trimmed with tagStrip and copied into the node: into
 *    nmem storage when len >= DATA1_LOCALDATA, into lbuf otherwise.
 *  - The element definition is looked up with data1_getelementbytagname
 *    against the root's abstract syntax; when the parent is a variant the
 *    parent tag's element is consulted (details not visible).
 *  - The node is linked as last child / next sibling (or first child) of
 *    the parent, stored in d1_stack, and the level is incremented.
 *
 * NOTE(review): parent and partag are initialised from
 * d1_stack[d1_level - 1] before the d1_level == 0 test below, i.e. index -1
 * when no record is open - confirm and consider moving them after the check.
 */
991 static void tagBegin (struct lexSpec *spec,
992 const char *tag, int len)
994 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
995 data1_element *elem = NULL;
996 data1_node *partag = get_parent_tag(spec->dh, parent);
998 data1_element *e = NULL;
1001 if (spec->d1_level == 0)
1003 logf (LOG_WARN, "in element begin. No record type defined");
1006 tagStrip (&tag, &len);
1008 res = data1_mk_node (spec->dh, spec->m);
1009 res->parent = parent;
1010 res->which = DATA1N_tag;
1011 res->u.tag.get_bytes = -1;
/* Long tag names go to nmem storage, short ones into the node's lbuf. */
1013 if (len >= DATA1_LOCALDATA)
1014 res->u.tag.tag = nmem_malloc (spec->m, len+1);
1016 res->u.tag.tag = res->lbuf;
1018 memcpy (res->u.tag.tag, tag, len);
1019 res->u.tag.tag[len] = '\0';
1022 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1024 if (parent->which == DATA1N_variant)
1027 if (!(e = partag->u.tag.element))
1030 elem = data1_getelementbytagname (spec->dh,
1031 spec->d1_stack[0]->u.root.absyn,
1033 res->u.tag.element = elem;
1034 res->u.tag.node_selected = 0;
1035 res->u.tag.make_variantlist = 0;
1036 res->u.tag.no_data_requested = 0;
1037 res->root = parent->root;
1039 parent->last_child = res;
/* A previous sibling exists: flush its pending text, then chain after it. */
1040 if (spec->d1_stack[spec->d1_level])
1042 tagDataRelease (spec);
1043 spec->d1_stack[spec->d1_level]->next = res;
1046 parent->child = res;
1047 spec->d1_stack[spec->d1_level] = res;
1048 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagEnd: close open nodes down towards min_level.
 * The tag name is trimmed, then while d1_level is above min_level pending
 * text at the current level is released with tagDataRelease. The visible
 * condition compares the node at the current level, when it is a tag node,
 * with the given tag by length and content; what is done on a match, and
 * the level decrement itself, are on lines not visible in this excerpt.
 */
1051 static void tagEnd (struct lexSpec *spec, int min_level,
1052 const char *tag, int len)
1054 tagStrip (&tag, &len);
1055 while (spec->d1_level > min_level)
1057 tagDataRelease (spec);
1059 if (spec->d1_level == 0)
1061 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1063 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1065 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1069 logf (LOG_DEBUG, "end tag (%d)", spec->d1_level);
/*
 * tryMatch: run a DFA over the input starting at file offset *pptr.
 * Characters are fetched with f_win_advance and the DFA is stepped through
 * its transition ranges (t->ch[0]..t->ch[1]); the rule number of the state
 * reached is remembered in last_rule (from rule_no or rule_nno). On the
 * visible result path *mptr receives the start of the match and *pptr the
 * offset just past its end. Return values and the end-of-file handling are
 * on lines not visible in this excerpt.
 */
1074 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1077 struct DFA_state *state = dfa->states[0];
1080 unsigned char c_prev = 0;
1081 int ptr = *pptr; /* current pointer */
1082 int start_ptr = *pptr; /* first char of match */
1083 int last_ptr = 0; /* last char of match */
1084 int last_rule = 0; /* rule number of current match */
1089 c = f_win_advance (spec, &ptr);
1090 if (ptr == F_WIN_EOF)
1107 *mptr = start_ptr; /* match starts here */
1108 *pptr = last_ptr; /* match end here (+1) */
/* Restart the automaton from its initial state. */
1111 state = dfa->states[0];
1116 else if (c >= t->ch[0] && c <= t->ch[1])
1118 state = dfa->states[t->to];
1123 last_rule = state->rule_no;
1128 last_rule = state->rule_nno;
/*
 * execTok: fetch the next token of an action's code string at *src and
 * report it through *tokBuf / *tokLen.
 *
 * Visible token forms, after skipping blanks and tabs:
 *  - "$" followed by digits: a reference to match argument n; its text is
 *    fetched from the file window with f_win_get using arg_start[n] and
 *    arg_end[n] (n is checked against arg_no);
 *  - a double-quoted string, whose length is the distance to the closing
 *    quote;
 *  - newline or ';' (statement separator);
 *  - any other run of characters up to white space, newline or ';'.
 * The integer codes returned for each form are on lines not visible here
 * (callers in this file test for the value 3).
 */
1140 static int execTok (struct lexSpec *spec, const char **src,
1141 const char **tokBuf, int *tokLen)
1143 const char *s = *src;
1145 while (*s == ' ' || *s == '\t')
1149 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1153 while (*s >= '0' && *s <= '9')
1154 n = n*10 + (*s++ -'0');
1155 if (spec->arg_no == 0)
1162 if (n >= spec->arg_no)
1164 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1168 else if (*s == '\"')
1171 while (*s && *s != '\"')
1173 *tokLen = s - *tokBuf;
1178 else if (*s == '\n' || *s == ';')
1186 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1188 *tokLen = s - *tokBuf;
1195 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1197 *tokLen = s - *tokBuf;
/*
 * regxStrz: copy len bytes from src into the caller-supplied buffer str.
 * NOTE(review): by its name and its use in execCode (result passed to
 * strcmp) this presumably NUL-terminates and returns str, and may clamp
 * len; those lines are not visible in this excerpt - confirm.
 */
1203 static char *regxStrz (const char *src, int len, char *str)
1207 memcpy (str, src, len);
/*
 * cmd_tcl_begin: Tcl command procedure registered as "begin" (see
 * grs_read_tcl). clientData is the lexSpec.
 *
 * Visible sub-commands (argv[1]):
 *  - record <absyn>   : look up the abstract syntax and push a root node;
 *  - element <tag>    : tagBegin;
 *  - variant <class> <type> <value> : variantBegin;
 *  - context <name>   : find the named context and push it on the context
 *                       stack, or warn "unknown context".
 *
 * NOTE(review): argv[1] is read before the argc tests on the visible
 * lines; an earlier argc check may exist on a line not shown - confirm.
 * NOTE(review): for "record", u.root.type is set to the argv[2] pointer
 * itself; confirm that string outlives the command invocation.
 * NOTE(review): no bound check against context_stack_size is visible
 * before the push onto context_stack - confirm.
 */
1213 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1214 int argc, char **argv)
1216 struct lexSpec *spec = clientData;
1219 if (!strcmp(argv[1], "record") && argc == 3)
1221 char *absynName = argv[2];
1225 logf (LOG_DEBUG, "begin record %s", absynName);
1227 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1228 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1233 res = data1_mk_node (spec->dh, spec->m);
1234 res->which = DATA1N_root;
1235 res->u.root.type = absynName;
1236 res->u.root.absyn = absyn;
1239 spec->d1_stack[spec->d1_level] = res;
1240 spec->d1_stack[++(spec->d1_level)] = NULL;
1243 else if (!strcmp(argv[1], "element") && argc == 3)
1245 tagBegin (spec, argv[2], strlen(argv[2]));
1247 else if (!strcmp (argv[1], "variant") && argc == 5)
1249 variantBegin (spec, argv[2], strlen(argv[2]),
1250 argv[3], strlen(argv[3]),
1251 argv[4], strlen(argv[4]));
1253 else if (!strcmp (argv[1], "context") && argc == 3)
1255 struct lexContext *lc = spec->context;
1257 logf (LOG_DEBUG, "begin context %s",argv[2]);
/* Linear search of the context list by name. */
1259 while (lc && strcmp (argv[2], lc->name))
1263 spec->context_stack[++(spec->context_stack_top)] = lc;
1266 logf (LOG_WARN, "unknown context %s", argv[2]);
/*
 * cmd_tcl_end: Tcl command procedure registered as "end" (see
 * grs_read_tcl). clientData is the lexSpec.
 *
 * Visible sub-commands (argv[1]):
 *  - record  : release pending data while d1_level is non-zero, then set
 *              stop_flag;
 *  - element : optional "-record" flag and element name; tagEnd is called
 *              and, on the path where d1_level is tested for 0, stop_flag
 *              is set (the exact condition is only partly visible);
 *  - context : pop the context stack unless it is already at the bottom.
 */
1273 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1274 int argc, char **argv)
1276 struct lexSpec *spec = clientData;
1280 if (!strcmp (argv[1], "record"))
1282 while (spec->d1_level)
1284 tagDataRelease (spec);
1288 logf (LOG_DEBUG, "end record");
1290 spec->stop_flag = 1;
1292 else if (!strcmp (argv[1], "element"))
1296 if (argc >= 3 && !strcmp(argv[2], "-record"))
1305 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1306 if (spec->d1_level == 0)
1309 logf (LOG_DEBUG, "end element end records");
1311 spec->stop_flag = 1;
1314 else if (!strcmp (argv[1], "context"))
1317 logf (LOG_DEBUG, "end context");
/* Never pop below the bottom entry of the context stack. */
1319 if (spec->context_stack_top)
1320 (spec->context_stack_top)--;
/*
 * cmd_tcl_data: Tcl command procedure registered as "data" (see
 * grs_read_tcl). clientData is the lexSpec.
 * Visible options are "-text" and "-element <name>". When an element name
 * was given the data is wrapped: tagBegin, execData with the data argument
 * and the text flag, then tagEnd down to level 1. The conditions guarding
 * tagBegin / tagEnd and the argument-count checks are on lines not visible
 * in this excerpt.
 */
1327 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1328 int argc, char **argv)
1332 const char *element = 0;
1333 struct lexSpec *spec = clientData;
1337 if (!strcmp("-text", argv[argi]))
1342 else if (!strcmp("-element", argv[argi]))
1346 element = argv[argi++];
1352 tagBegin (spec, element, strlen(element));
1356 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1360 tagEnd (spec, 1, NULL, 0);
/*
 * cmd_tcl_unread: Tcl command procedure registered as "unread" (see
 * grs_read_tcl). clientData is the lexSpec.
 * Moves the read position back to the start of match argument <no>, plus
 * an optional "-offset <n>". The argument index is clamped to the last
 * available argument.
 * NOTE(review): the numbers are parsed with atoi, so malformed or negative
 * input is not detected on the visible lines - confirm whether a negative
 * index is rejected elsewhere.
 */
1364 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1365 int argc, char **argv)
1367 struct lexSpec *spec = clientData;
1374 if (!strcmp("-offset", argv[argi]))
1379 offset = atoi(argv[argi]);
1388 no = atoi(argv[argi]);
1389 if (no >= spec->arg_no)
1390 no = spec->arg_no - 1;
1391 spec->ptr = spec->arg_start[no] + offset;
/*
 * execTcl: run an action's code through the Tcl interpreter.
 * Before evaluation, each match argument i is published as the Tcl
 * variable named by its decimal index ("0", "1", ...): the argument text is
 * fetched from the file window with f_win_get, temporarily NUL-terminated
 * in place for Tcl_SetVar, and the overwritten byte is restored afterwards.
 * Finally code->str is evaluated with Tcl_Eval.
 * NOTE(review): the temporary terminator is written at var_buf[var_len],
 * one byte past the fetched range inside the window buffer; confirm that
 * position is always within the allocated buffer.
 */
1395 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1398 for (i = 0; i < spec->arg_no; i++)
1400 char var_name[10], *var_buf;
1403 sprintf (var_name, "%d", i);
1404 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* Save the byte after the argument, terminate, set variable, restore. */
1408 ch = var_buf[var_len];
1409 var_buf[var_len] = '\0';
1410 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1411 var_buf[var_len] = ch;
1414 Tcl_Eval (spec->tcl_interp, code->str);
/*
 * execCode: interpret an action's code string with the built-in (non-Tcl)
 * command language. Tokens are pulled with execTok; the command word is
 * copied to a NUL-terminated scratch string with regxStrz and dispatched.
 *
 * Visible commands:
 *  - begin record <absyn> | element <tag> | variant <class> <type> <value>
 *          | context <name>
 *  - end record | element [-record] [<tag>] | context
 *  - data [-text] [-element <name>] <item>...
 *  - unread [-offset <n>] <index>     (index is a single digit)
 *  - context <name>                   (replaces the top of the context stack)
 * Unknown commands and stray tokens are logged and skipped.
 *
 * NOTE(review): "begin record" copies the name into a function-static
 * 64-byte buffer (absynName) and stores a pointer to it in the root node;
 * any clamp of cmd_len before the memcpy is not visible in this excerpt,
 * and the static buffer is shared between calls - confirm both.
 * NOTE(review): as in cmd_tcl_begin, no bound check against
 * context_stack_size is visible before the push in "begin context".
 */
1419 static void execCode (struct lexSpec *spec, struct regxCode *code)
1421 const char *s = code->str;
1423 const char *cmd_str;
1425 r = execTok (spec, &s, &cmd_str, &cmd_len);
1432 r = execTok (spec, &s, &cmd_str, &cmd_len);
1435 p = regxStrz (cmd_str, cmd_len, ptmp);
/* ---- begin ... ---- */
1436 if (!strcmp (p, "begin"))
1438 r = execTok (spec, &s, &cmd_str, &cmd_len);
1441 logf (LOG_WARN, "missing keyword after 'begin'");
1444 p = regxStrz (cmd_str, cmd_len, ptmp);
1445 if (!strcmp (p, "record"))
1447 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* A root node is only created when no record is open yet. */
1450 if (spec->d1_level == 0)
1452 static char absynName[64];
1457 memcpy (absynName, cmd_str, cmd_len);
1458 absynName[cmd_len] = '\0';
1461 logf (LOG_DEBUG, "begin record %s", absynName);
1463 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1464 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1469 res = data1_mk_node (spec->dh, spec->m);
1470 res->which = DATA1N_root;
1471 res->u.root.type = absynName;
1472 res->u.root.absyn = absyn;
1475 spec->d1_stack[spec->d1_level] = res;
1476 spec->d1_stack[++(spec->d1_level)] = NULL;
1479 r = execTok (spec, &s, &cmd_str, &cmd_len);
1481 else if (!strcmp (p, "element"))
1483 r = execTok (spec, &s, &cmd_str, &cmd_len);
1486 tagBegin (spec, cmd_str, cmd_len);
1487 r = execTok (spec, &s, &cmd_str, &cmd_len);
1489 else if (!strcmp (p, "variant"))
1492 const char *class_str = NULL;
1494 const char *type_str = NULL;
1496 const char *value_str = NULL;
1497 r = execTok (spec, &s, &cmd_str, &cmd_len);
1500 class_str = cmd_str;
1501 class_len = cmd_len;
1502 r = execTok (spec, &s, &cmd_str, &cmd_len);
1508 r = execTok (spec, &s, &cmd_str, &cmd_len);
1511 value_str = cmd_str;
1512 value_len = cmd_len;
1514 variantBegin (spec, class_str, class_len,
1515 type_str, type_len, value_str, value_len);
1518 r = execTok (spec, &s, &cmd_str, &cmd_len);
1520 else if (!strcmp (p, "context"))
1524 struct lexContext *lc = spec->context;
1525 r = execTok (spec, &s, &cmd_str, &cmd_len);
1526 p = regxStrz (cmd_str, cmd_len, ptmp);
1528 logf (LOG_DEBUG, "begin context %s", p);
1530 while (lc && strcmp (p, lc->name))
/* "begin context" pushes a new entry on the context stack. */
1533 spec->context_stack[++(spec->context_stack_top)] = lc;
1535 logf (LOG_WARN, "unknown context %s", p);
1538 r = execTok (spec, &s, &cmd_str, &cmd_len);
1542 logf (LOG_WARN, "bad keyword '%s' after begin", p);
/* ---- end ... ---- */
1545 else if (!strcmp (p, "end"))
1547 r = execTok (spec, &s, &cmd_str, &cmd_len);
1550 logf (LOG_WARN, "missing keyword after 'end'");
1553 p = regxStrz (cmd_str, cmd_len, ptmp);
1554 if (!strcmp (p, "record"))
1556 while (spec->d1_level)
1558 tagDataRelease (spec);
1561 r = execTok (spec, &s, &cmd_str, &cmd_len);
1563 logf (LOG_DEBUG, "end record");
1565 spec->stop_flag = 1;
1567 else if (!strcmp (p, "element"))
1570 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1572 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1577 tagEnd (spec, min_level, cmd_str, cmd_len);
1578 r = execTok (spec, &s, &cmd_str, &cmd_len);
1581 tagEnd (spec, min_level, NULL, 0);
1582 if (spec->d1_level == 0)
1585 logf (LOG_DEBUG, "end element end records");
1587 spec->stop_flag = 1;
1591 else if (!strcmp (p, "context"))
1594 logf (LOG_DEBUG, "end context");
1596 if (spec->context_stack_top)
1597 (spec->context_stack_top)--;
1598 r = execTok (spec, &s, &cmd_str, &cmd_len);
1601 logf (LOG_WARN, "bad keyword '%s' after end", p);
/* ---- data ... ---- */
1603 else if (!strcmp (p, "data"))
1607 const char *element_str = NULL;
1609 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1611 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1613 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1615 r = execTok (spec, &s, &element_str, &element_len);
1620 logf (LOG_WARN, "bad data option: %.*s",
1625 logf (LOG_WARN, "missing data item after data");
1629 tagBegin (spec, element_str, element_len);
1632 execData (spec, cmd_str, cmd_len,textFlag);
1633 r = execTok (spec, &s, &cmd_str, &cmd_len);
1636 tagEnd (spec, 1, NULL, 0);
/* ---- unread ... ---- */
1638 else if (!strcmp (p, "unread"))
1641 r = execTok (spec, &s, &cmd_str, &cmd_len);
1642 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1644 r = execTok (spec, &s, &cmd_str, &cmd_len);
1647 logf (LOG_WARN, "missing number after -offset");
1650 p = regxStrz (cmd_str, cmd_len, ptmp);
1652 r = execTok (spec, &s, &cmd_str, &cmd_len);
1658 logf (LOG_WARN, "missing index after unread command");
/* Only a single decimal digit is accepted as argument index. */
1661 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1663 logf (LOG_WARN, "bad index after unread command");
1668 no = *cmd_str - '0';
1669 if (no >= spec->arg_no)
1670 no = spec->arg_no - 1;
1671 spec->ptr = spec->arg_start[no] + offset;
1673 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* ---- context <name>: switch context without pushing ---- */
1675 else if (!strcmp (p, "context"))
1679 struct lexContext *lc = spec->context;
1680 r = execTok (spec, &s, &cmd_str, &cmd_len);
1681 p = regxStrz (cmd_str, cmd_len, ptmp);
1683 while (lc && strcmp (p, lc->name))
1686 spec->context_stack[spec->context_stack_top] = lc;
1688 logf (LOG_WARN, "unknown context %s", p);
1691 r = execTok (spec, &s, &cmd_str, &cmd_len);
1695 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1696 r = execTok (spec, &s, &cmd_str, &cmd_len);
1701 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1703 r = execTok (spec, &s, &cmd_str, &cmd_len);
/*
 * execAction: execute the action list ap for a rule whose match started at
 * start_ptr; *pptr is the current input position and is advanced by
 * pattern actions.
 *
 * Visible behaviour:
 *  - Local arg_start / arg_end arrays record the file offsets of each match
 *    argument and are published through spec->arg_start / spec->arg_end;
 *    argument 0 starts at start_ptr.
 *  - Pattern actions call tryMatch. For a "body" pattern the text before
 *    the match and the match itself become separate arguments; when the
 *    match fails the ranges are set to F_WIN_EOF. For a plain pattern the
 *    match must begin exactly at the current position.
 *  - Code actions publish arg_no and run execTcl (when a Tcl interpreter is
 *    attached) or execCode; a set stop_flag is checked afterwards.
 * The switch labels and return values are on lines not visible in this
 * excerpt.
 */
1710 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1711 int start_ptr, int *pptr)
1720 arg_start[0] = start_ptr;
1722 spec->arg_start = arg_start;
1723 spec->arg_end = arg_end;
1730 if (ap->u.pattern.body)
1732 arg_start[arg_no] = *pptr;
1733 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1735 arg_end[arg_no] = F_WIN_EOF;
1737 arg_start[arg_no] = F_WIN_EOF;
1738 arg_end[arg_no] = F_WIN_EOF;
/* Body text ends where the match starts; the match is the next argument. */
1743 arg_end[arg_no] = sptr;
1745 arg_start[arg_no] = sptr;
1746 arg_end[arg_no] = *pptr;
1751 arg_start[arg_no] = *pptr;
1752 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1754 if (sptr != arg_start[arg_no])
1756 arg_end[arg_no] = *pptr;
1761 spec->arg_no = arg_no;
1764 if (spec->tcl_interp)
1765 execTcl(spec, ap->u.code);
1767 execCode (spec, ap->u.code);
1769 execCode (spec, ap->u.code);
1772 if (spec->stop_flag)
1776 arg_start[arg_no] = *pptr;
1777 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: run the action list of rule number ruleNo in the given context.
 * The rule is found through the context's fastRule table (built in
 * readFileSpec) and its actions are executed with execAction, whose result
 * is returned.
 */
1786 static int execRule (struct lexSpec *spec, struct lexContext *context,
1787 int ruleNo, int start_ptr, int *pptr)
1790 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1792 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: main scanning loop. Starting at file offset *ptr, characters are
 * fetched with f_win_advance and fed to the DFA of the context on top of
 * the context stack.
 *
 * Visible behaviour:
 *  - When a rule matches, any unmatched text between skip_ptr and the start
 *    of the match is first emitted as data (execDataP), then the rule is
 *    executed with execRule.
 *  - At end of file, remaining unmatched text is emitted the same way.
 *  - After a rule has run, the current context is re-read from the context
 *    stack (actions may have changed it) and the DFA restarts from state 0.
 *  - When execRule returns 0 and an end callback (f_win_ef) is installed,
 *    it is invoked with the current offset unless that is F_WIN_EOF.
 * Return values and several branch bodies are on lines not visible in this
 * excerpt.
 */
1796 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1798 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1799 struct DFA_state *state = context->dfa->states[0];
1802 unsigned char c_prev = '\n';
1804 int last_rule = 0; /* rule number of current match */
1805 int last_ptr = *ptr; /* last char of match */
1806 int start_ptr = *ptr; /* first char of match */
1807 int skip_ptr = *ptr; /* first char of run */
1811 c = f_win_advance (spec, ptr);
1812 if (*ptr == F_WIN_EOF)
1814 /* end of file met */
1817 /* there was a match */
1818 if (skip_ptr < start_ptr)
1820 /* deal with chars that didn't match */
1823 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1824 execDataP (spec, buf, size, 0);
1826 /* restore pointer */
1829 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1831 /* restore skip pointer */
1835 else if (skip_ptr < *ptr)
1837 /* deal with chars that didn't match */
1840 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1841 execDataP (spec, buf, size, 0);
1843 if (*ptr == F_WIN_EOF)
1850 { /* no transition for character c ... */
1853 if (skip_ptr < start_ptr)
1855 /* deal with chars that didn't match */
1858 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1859 execDataP (spec, buf, size, 0);
1861 /* restore pointer */
1863 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1865 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1868 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1870 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* The executed rule may have switched context; reload it. */
1874 context = spec->context_stack[spec->context_stack_top];
1877 last_ptr = start_ptr = *ptr;
1881 c_prev = f_win_advance (spec, &start_ptr);
1886 c_prev = f_win_advance (spec, &start_ptr);
1889 state = context->dfa->states[0];
1892 else if (c >= t->ch[0] && c <= t->ch[1])
1893 { /* transition ... */
1894 state = context->dfa->states[t->to];
1899 last_rule = state->rule_no;
1902 else if (state->rule_nno)
1904 last_rule = state->rule_nno;
/*
 * lexRoot: parse one record using the context named context_name.
 *
 * Visible behaviour:
 *  - stop_flag is cleared and the context stack is reset to a single entry:
 *    the context whose name equals context_name (a warning is logged when
 *    it cannot be found).
 *  - The context's init action list is executed (on a condition not visible
 *    here), then its begin action list, then the main scanner lexNode.
 *  - Any levels still open afterwards are closed, releasing pending text
 *    with tagDataRelease, and the end action list is executed.
 *  - The node stored in d1_stack[0] is returned.
 */
1916 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1917 const char *context_name)
1919 struct lexContext *lt = spec->context;
1922 spec->stop_flag = 0;
1924 spec->context_stack_top = 0;
1927 if (!strcmp (lt->name, context_name))
1933 logf (LOG_WARN, "cannot find context %s", context_name);
1936 spec->context_stack[spec->context_stack_top] = lt;
1937 spec->d1_stack[spec->d1_level] = NULL;
1942 execAction (spec, lt->initActionList, ptr, &ptr);
1945 execAction (spec, lt->beginActionList, ptr, &ptr);
1946 lexNode (spec, &ptr);
1947 while (spec->d1_level)
1949 tagDataRelease (spec);
1952 execAction (spec, lt->endActionList, ptr, &ptr);
1953 return spec->d1_stack[0];
/*
 * grs_destroy: release filter client data created by grs_init.
 * The contained lexSpec is destroyed with lexSpecDestroy; freeing of the
 * lexSpecs container itself is presumably on a line not visible here.
 */
1956 void grs_destroy(void *clientData)
1958 struct lexSpecs *specs = clientData;
1961 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocate the per-filter client data (struct lexSpecs).
 * NOTE(review): initialisation of its spec member and the return statement
 * are not visible in this excerpt - confirm spec starts out as NULL, since
 * grs_read_regx / grs_read_tcl test it.
 */
1966 void *grs_init(void)
1968 struct lexSpecs *specs = xmalloc (sizeof(*specs));
/*
 * grs_read_regx: record reader entry point for the regx filter.
 *
 * The lexSpec cached in the client data is reused when its name equals
 * p->type; otherwise the old one is destroyed, a new one is created and its
 * specification file is loaded with readFileSpec (on failure the new spec
 * is destroyed again). The spec is then bound to this read request - data1
 * handle, read / seek / end callbacks, file handle, an empty window of
 * 500000 bytes capacity, and the nmem handle - and the record is parsed
 * from p->offset with lexRoot in context "main".
 */
1973 data1_node *grs_read_regx (struct grs_read_info *p)
1976 struct lexSpecs *specs = p->clientData;
1977 struct lexSpec **curLexSpec = &specs->spec;
1980 logf (LOG_DEBUG, "grs_read_regx");
/* (Re)load the filter when none is cached or the type has changed. */
1982 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
1985 lexSpecDestroy (curLexSpec);
1986 *curLexSpec = lexSpecCreate (p->type, p->dh);
1987 res = readFileSpec (*curLexSpec);
1990 lexSpecDestroy (curLexSpec);
1994 (*curLexSpec)->dh = p->dh;
/* Start with an empty file window. */
1997 (*curLexSpec)->f_win_start = 0;
1998 (*curLexSpec)->f_win_end = 0;
1999 (*curLexSpec)->f_win_rf = p->readf;
2000 (*curLexSpec)->f_win_sf = p->seekf;
2001 (*curLexSpec)->f_win_fh = p->fh;
2002 (*curLexSpec)->f_win_ef = p->endf;
2003 (*curLexSpec)->f_win_size = 500000;
2005 (*curLexSpec)->m = p->mem;
2006 return lexRoot (*curLexSpec, p->offset, "main");
2009 static struct recTypeGrs regx_type = {
/* Exported handle for the regx record type; it points at the static
 * regx_type descriptor defined in this file. */
2016 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: record reader entry point for the Tcl-enabled filter.
 *
 * Works like grs_read_regx, with one addition when a new lexSpec has to be
 * created: a Tcl interpreter is created and stored in the spec, and the
 * commands "begin", "end", "data" and "unread" are registered with the spec
 * as client data before the specification file is read (readFileSpec picks
 * the .tflt file name when an interpreter is present). The spec is then
 * bound to this read request and the record is parsed from p->offset with
 * lexRoot in context "main".
 */
2019 data1_node *grs_read_tcl (struct grs_read_info *p)
2022 struct lexSpecs *specs = p->clientData;
2023 struct lexSpec **curLexSpec = &specs->spec;
2026 logf (LOG_DEBUG, "grs_read_tcl");
/* (Re)load the filter when none is cached or the type has changed. */
2028 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2030 Tcl_Interp *tcl_interp;
2032 lexSpecDestroy (curLexSpec);
2033 *curLexSpec = lexSpecCreate (p->type, p->dh);
2034 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
2035 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2036 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2037 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
2038 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2040 res = readFileSpec (*curLexSpec);
2043 lexSpecDestroy (curLexSpec);
2047 (*curLexSpec)->dh = p->dh;
/* Start with an empty file window. */
2050 (*curLexSpec)->f_win_start = 0;
2051 (*curLexSpec)->f_win_end = 0;
2052 (*curLexSpec)->f_win_rf = p->readf;
2053 (*curLexSpec)->f_win_sf = p->seekf;
2054 (*curLexSpec)->f_win_fh = p->fh;
2055 (*curLexSpec)->f_win_ef = p->endf;
2056 (*curLexSpec)->f_win_size = 500000;
2058 (*curLexSpec)->m = p->mem;
2059 return lexRoot (*curLexSpec, p->offset, "main");
2062 static struct recTypeGrs tcl_type = {
/* Exported handle for the Tcl record type; it points at the static
 * tcl_type descriptor defined in this file. */
2069 RecTypeGrs recTypeGrs_tcl = &tcl_type;