2 * Copyright (C) 1994-1999, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.29 1999-07-12 07:27:54 adam
8 * Improved speed of Tcl processing. Fixed one memory leak.
10 * Revision 1.28 1999/07/06 12:26:04 adam
11 * Fixed filters so that MS-DOS CR is ignored.
13 * Revision 1.27 1999/06/28 13:25:40 quinn
14 * Improved diagnostics for Tcl
16 * Revision 1.26 1999/05/26 07:49:14 adam
19 * Revision 1.25 1999/05/25 12:33:32 adam
20 * Fixed bug in Tcl filter.
22 * Revision 1.24 1999/05/21 11:08:46 adam
23 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
24 * script so that it reads uninstalled Tcl source.
26 * Revision 1.23 1999/05/20 12:57:18 adam
27 * Implemented TCL filter. Updated recctrl system.
29 * Revision 1.22 1998/11/03 16:07:13 adam
32 * Revision 1.21 1998/11/03 15:43:39 adam
33 * Fixed bug introduced by previous commit.
35 * Revision 1.20 1998/11/03 14:51:28 adam
36 * Changed code so that it creates as few data1 nodes as possible.
38 * Revision 1.19 1998/11/03 10:22:39 adam
39 * Fixed memory leak that could occur when large data1 nodes were
40 * concatenated. Data-type data1_nodes may have multiple nodes.
42 * Revision 1.18 1998/10/15 13:11:47 adam
43 * Added support for option -record for "end element". When specified
44 * end element will mark end-of-record when at outer-level.
46 * Revision 1.17 1998/07/01 10:13:51 adam
49 * Revision 1.16 1998/06/30 15:15:09 adam
50 * Tags are trimmed: white space is removed before and after the tag.
52 * Revision 1.15 1998/06/30 12:55:45 adam
55 * Revision 1.14 1998/03/05 08:41:00 adam
56 * Implemented rule contexts.
58 * Revision 1.13 1997/12/12 06:33:58 adam
59 * Fixed bug that showed up when multiple filters were used.
60 * Made one routine thread-safe.
62 * Revision 1.12 1997/11/18 10:03:24 adam
63 * Member num_children removed from data1_node.
65 * Revision 1.11 1997/11/06 11:41:01 adam
66 * Implemented "begin variant" for the sgml.regx filter.
68 * Revision 1.10 1997/10/31 12:36:12 adam
69 * Minor change that avoids compiler warning.
71 * Revision 1.9 1997/09/29 09:02:49 adam
72 * Fixed small bug (introduced by previous commit).
74 * Revision 1.8 1997/09/17 12:19:22 adam
75 * Zebra version corresponds to YAZ version 1.4.
76 * Changed Zebra server so that it doesn't depend on global common_resource.
78 * Revision 1.7 1997/07/15 16:33:07 adam
79 * Check for zero length in execData.
81 * Revision 1.6 1997/02/24 10:41:51 adam
82 * Cleanup of code and commented out the "end element-end-record" code.
84 * Revision 1.5 1997/02/19 16:22:33 adam
85 * Fixed "end element" to terminate record in outer-most level.
87 * Revision 1.4 1997/02/12 20:42:58 adam
88 * Changed some log messages.
90 * Revision 1.3 1996/11/08 14:05:33 adam
91 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
93 * Revision 1.2 1996/10/29 14:02:09 adam
94 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
95 * data1_get_tabpath is used.
97 * Revision 1.1 1996/10/11 10:57:30 adam
98 * New module recctrl. Used to manage records (extract/retrieval).
100 * Revision 1.24 1996/06/17 14:25:31 adam
101 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
103 * Revision 1.23 1996/06/04 10:19:00 adam
104 * Minor changes - removed include of ctype.h.
106 * Revision 1.22 1996/06/03 15:23:13 adam
107 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
109 * Revision 1.21 1996/05/14 16:58:38 adam
112 * Revision 1.20 1996/05/01 13:46:36 adam
113 * First work on multiple records in one file.
114 * New option, -offset, to the "unread" command in the filter module.
116 * Revision 1.19 1996/02/12 16:18:20 adam
117 * Yet another bug fix in implementation of unread command.
119 * Revision 1.18 1996/02/12 16:07:54 adam
120 * Bug fix in new unread command.
122 * Revision 1.17 1996/02/12 15:56:11 adam
123 * New code command: unread.
125 * Revision 1.16 1996/01/17 14:57:51 adam
126 * Prototype changed for reader functions in extract/retrieve. File
127 * is identified by 'void *' instead of 'int'.
129 * Revision 1.15 1996/01/08 19:15:47 adam
130 * New input filter that works!
132 * Revision 1.14 1996/01/08 09:10:38 adam
133 * Yet another complete rework on this module.
135 * Revision 1.13 1995/12/15 17:21:50 adam
136 * This version is able to set data.formatted_text in data1-nodes.
138 * Revision 1.12 1995/12/15 16:20:10 adam
139 * The filter files (*.flt) are read from the path given by data1_tabpath.
141 * Revision 1.11 1995/12/15 12:35:16 adam
144 * Revision 1.10 1995/12/15 10:35:36 adam
147 * Revision 1.9 1995/12/14 16:38:48 adam
148 * Completely new attempt to make regular expression parsing.
150 * Revision 1.8 1995/12/13 17:16:59 adam
153 * Revision 1.7 1995/12/13 16:51:58 adam
154 * Modified to set last_child in data1_nodes.
155 * Uses destroy handler to free up data text nodes.
157 * Revision 1.6 1995/12/13 13:45:37 quinn
158 * Changed data1 to use nmem.
160 * Revision 1.5 1995/12/11 09:12:52 adam
161 * The rec_get function returns NULL if record doesn't exist - will
162 * happen in the server if the result set records have been deleted since
163 * the creation of the set (i.e. the search).
164 * The server saves a result temporarily if it is 'volatile', i.e. the
165 * set is register dependent.
167 * Revision 1.4 1995/12/05 16:57:40 adam
168 * More work on regular patterns.
170 * Revision 1.3 1995/12/05 09:37:09 adam
171 * One malloc was renamed to xmalloc.
173 * Revision 1.2 1995/12/04 17:59:24 adam
174 * More work on regular expression conversion.
176 * Revision 1.1 1995/12/04 14:25:30 adam
177 * Started work on regular expression parsed input to structured records.
186 #include <zebrautl.h>
/* Sentinel file position: f_win_advance() compares *pos against it and
 * execAction() stores it in arg_start[] / arg_end[] entries. */
196 #define F_WIN_EOF 2000000000
/* Token / action kinds. REGX_PATTERN tags the dfa member of an action;
 * REGX_CONTEXT is the token readOneSpec() tests for before it reads a
 * context name. */
200 #define REGX_PATTERN 1
205 #define REGX_CONTEXT 6
/* One action of a rule or of a context's begin/end/init list. Actions are
 * chained through next; the rest of this file reaches the payload as
 * u.pattern.dfa (pattern actions) and u.code (code actions). */
215 struct lexRuleAction {
219 struct DFA *dfa; /* REGX_PATTERN */
222 struct regxCode *code; /* REGX_CODE */
224 struct lexRuleAction *next;
/* Head of a rule's action list; lexContextDestroy() releases it with
 * actionListDel(). */
229 struct lexRuleAction *actionList;
/* Rule payload followed by the link to the next rule of the same context. */
233 struct lexRuleInfo info;
234 struct lexRule *next;
/* Rules of this context; readOneSpec() inserts new rules at the head. */
240 struct lexRule *rules;
/* Table indexed by rule number (info.no); allocated and filled by
 * readFileSpec(). */
241 struct lexRuleInfo **fastRule;
/* Action lists that readOneSpec() rebuilds through actionListMk(). */
245 struct lexRuleAction *beginActionList;
246 struct lexRuleAction *endActionList;
247 struct lexRuleAction *initActionList;
/* Next context in the list headed by the lexSpec's context member. */
248 struct lexContext *next;
/* Text accumulation buffer. lexSpecCreate() sets its len, max and buf
 * members to zero and execData() allocates a larger buf when needed. */
251 struct lexConcatBuf {
/* Head of the list of contexts read from the filter specification. */
259 struct lexContext *context;
/* Context stack: the entry at index context_stack_top is the context
 * that lexNode() works with. lexSpecCreate() sizes it to 100 entries. */
261 struct lexContext **context_stack;
262 int context_stack_size;
263 int context_stack_top;
/* Tcl interpreter; when it is non-NULL readFileSpec() builds the file
 * name "<name>.tflt". */
269 Tcl_Interp *tcl_interp;
/* Callback that lexNode() invokes with the file handle and a position. */
272 void (*f_win_ef)(void *, off_t);
/* Read window over the input file, maintained by f_win_get().
 * NOTE(review): these offsets are int while f_win_get() and the seek
 * callback work with off_t - confirm that offsets always fit in an int. */
274 int f_win_start; /* first byte of buffer is this file offset */
275 int f_win_end; /* last byte of buffer is this offset - 1 */
276 int f_win_size; /* size of buffer */
277 char *f_win_buf; /* buffer itself */
/* Read callback (handle, destination, byte count) and seek callback
 * (handle, position) used by f_win_get(). */
278 int (*f_win_rf)(void *, char *, size_t);
279 off_t (*f_win_sf)(void *, off_t);
/* One accumulation buffer per nesting level (maxLevel entries). */
281 struct lexConcatBuf **concatBuf;
/* Stack of data1 nodes indexed by d1_level (maxLevel entries). */
283 data1_node **d1_stack;
/* Reference to a lexSpec instance. */
294 struct lexSpec *spec;
/*
 * f_win_get: return a pointer to the buffered bytes of the file range
 * [start_pos, end_pos) and store the number of usable bytes in *size.
 *
 * Three cases are handled in order:
 *  1. the range lies inside the current window - it is returned directly;
 *  2. start_pos lies outside the window - the file is repositioned with
 *     f_win_sf and the buffer is refilled from its start with f_win_rf;
 *  3. start_pos is inside the window but end_pos is beyond it - the tail
 *     of the window is moved to the front of the buffer and more data is
 *     read behind it.
 * In cases 2 and 3 *size is clipped to end_pos - start_pos.
 *
 * NOTE(review): off is an int computed from off_t values and the window
 * bounds are ints too - confirm that offsets always fit in an int.
 */
297 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
300 int i, r, off = start_pos - spec->f_win_start;
/* Case 1: the requested range is already buffered. */
302 if (off >= 0 && end_pos <= spec->f_win_end)
304 *size = end_pos - start_pos;
305 return spec->f_win_buf + off;
/* Case 2: the window does not contain start_pos; restart it there. */
307 if (off < 0 || start_pos >= spec->f_win_end)
309 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
310 spec->f_win_start = start_pos;
/* The buffer is allocated on first use. */
312 if (!spec->f_win_buf)
313 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
314 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
316 spec->f_win_end = spec->f_win_start + *size;
318 if (*size > end_pos - start_pos)
319 *size = end_pos - start_pos;
320 return spec->f_win_buf;
/* Case 3: slide the bytes from start_pos onwards to the buffer start. */
322 for (i = 0; i<spec->f_win_end - start_pos; i++)
323 spec->f_win_buf[i] = spec->f_win_buf[i + off];
/* Read at most f_win_size - i further bytes. */
324 r = (*spec->f_win_rf)(spec->f_win_fh,
326 spec->f_win_size - i);
327 spec->f_win_start = start_pos;
328 spec->f_win_end += r;
330 if (*size > end_pos - start_pos)
331 *size = end_pos - start_pos;
332 return spec->f_win_buf;
/*
 * f_win_advance: when *pos lies inside the current window, return the
 * character at that file position and advance *pos by one; otherwise a
 * single byte is requested through f_win_get().
 * Callers (tryMatch, lexNode) compare *pos with F_WIN_EOF afterwards.
 * NOTE(review): f_win_buf is a plain char buffer, so the value returned
 * by the fast path may be negative for bytes >= 0x80 where char is
 * signed - confirm that callers store it in an unsigned type.
 */
335 static int f_win_advance (struct lexSpec *spec, int *pos)
340 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
341 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
342 if (*pos == F_WIN_EOF)
344 buf = f_win_get (spec, *pos, *pos+1, &size);
/*
 * regxCodeDel: release the regxCode object referenced by *pp.
 * The cleanup drops the reference on the Tcl object that regxCodeMk()
 * acquired with Tcl_IncrRefCount().
 */
354 static void regxCodeDel (struct regxCode **pp)
356 struct regxCode *p = *pp;
361 Tcl_DecrRefCount (p->tcl_obj);
/*
 * regxCodeMk: build a regxCode from the len bytes at buf.
 * The text is copied into a freshly allocated buffer of len+1 bytes (room
 * for a terminator) and is also wrapped in a Tcl string object whose
 * reference count is incremented; regxCodeDel() drops that reference.
 */
369 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
373 p = (struct regxCode *) xmalloc (sizeof(*p));
374 p->str = (char *) xmalloc (len+1);
375 memcpy (p->str, buf, len);
378 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
380 Tcl_IncrRefCount (p->tcl_obj);
/*
 * lexSpecDFA: create a DFA prepared for parsing filter patterns.
 * The parser's character map is adjusted: the entries for space and tab
 * are removed and '/' is added with value 0.
 * NOTE(review): presumably this makes '/' end a pattern, matching the
 * "expects / at end of pattern" check in readOneSpec() - confirm.
 */
385 static struct DFA *lexSpecDFA (void)
390 dfa_parse_cmap_del (dfa, ' ');
391 dfa_parse_cmap_del (dfa, '\t');
392 dfa_parse_cmap_add (dfa, '/', 0);
/*
 * actionListDel: free the actions of the list starting at *rap.
 * Pattern actions release their DFA with dfa_delete(); code actions
 * release their code with regxCodeDel(). ra1 carries the successor across
 * each iteration step.
 */
396 static void actionListDel (struct lexRuleAction **rap)
398 struct lexRuleAction *ra1, *ra;
400 for (ra = *rap; ra; ra = ra1)
406 dfa_delete (&ra->u.pattern.dfa);
409 regxCodeDel (&ra->u.code);
/*
 * lexContextCreate: allocate a context called name.
 * The name is duplicated with xstrdup(), a fresh pattern DFA is obtained
 * from lexSpecDFA() and the begin/end/init action lists start out empty.
 */
417 static struct lexContext *lexContextCreate (const char *name)
419 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
421 p->name = xstrdup (name);
424 p->dfa = lexSpecDFA ();
427 p->beginActionList = NULL;
428 p->endActionList = NULL;
429 p->initActionList = NULL;
/*
 * lexContextDestroy: release what context p owns: its DFA, the action
 * list of each rule and the begin/end/init action lists.
 */
434 static void lexContextDestroy (struct lexContext *p)
436 struct lexRule *rp, *rp1;
438 dfa_delete (&p->dfa);
/* rp1 carries the next rule across the iteration step. */
440 for (rp = p->rules; rp; rp = rp1)
443 actionListDel (&rp->info.actionList);
446 actionListDel (&p->beginActionList);
447 actionListDel (&p->endActionList);
448 actionListDel (&p->initActionList);
/*
 * lexSpecCreate: allocate a lexSpec called name (dh is the data1 handle).
 * Sets up a private copy of the name, a context stack with room for 100
 * entries, one lexConcatBuf per level (maxLevel of them, all empty) and
 * the data1 node stack with maxLevel entries.
 */
453 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
458 p = (struct lexSpec *) xmalloc (sizeof(*p));
/* Manual duplicate of name; lexContextCreate() uses xstrdup() for the
 * same job. */
459 p->name = (char *) xmalloc (strlen(name)+1);
460 strcpy (p->name, name);
467 p->context_stack_size = 100;
468 p->context_stack = (struct lexContext **)
469 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
473 p->concatBuf = (struct lexConcatBuf **)
474 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
/* Every level starts with an empty, unallocated text buffer. */
475 for (i = 0; i < p->maxLevel; i++)
477 p->concatBuf[i] = (struct lexConcatBuf *)
478 xmalloc (sizeof(**p->concatBuf));
479 p->concatBuf[i]->len = p->concatBuf[i]->max = 0;
480 p->concatBuf[i]->buf = 0;
482 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/*
 * lexSpecDestroy: tear down the lexSpec referenced by pp: the per-level
 * concat buffers, the contexts on the context list, the Tcl interpreter,
 * the file window buffer and the context stack.
 * NOTE(review): the loop below frees each lexConcatBuf structure; verify
 * that the text buffer it owns (buf) and d1_stack are released as well.
 */
487 static void lexSpecDestroy (struct lexSpec **pp)
490 struct lexContext *lt;
498 for (i = 0; i < p->maxLevel; i++)
499 xfree (p->concatBuf[i]);
500 xfree (p->concatBuf);
/* The successor is fetched before the context itself is destroyed. */
505 struct lexContext *lt_next = lt->next;
506 lexContextDestroy (lt);
511 Tcl_DeleteInterp (p->tcl_interp);
514 xfree (p->f_win_buf);
515 xfree (p->context_stack);
/*
 * readParseToken: scan the next token of a filter specification.
 * *cpp is the current read position (a private copy, cp, is used while
 * scanning); callers use *len as the token length.
 * Leading blanks, tabs, newlines and carriage returns are skipped.
 * Keywords are gathered into cmd with upper-case letters folded to lower
 * case and are then matched against "begin", "end", "body", "context" and
 * "init"; any other word is reported as a bad command.
 * The return value is a token code; actionListMk() stops when it is 0.
 */
521 static int readParseToken (const char **cpp, int *len)
523 const char *cp = *cpp;
527 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
556 if (*cp >= 'a' && *cp <= 'z')
558 else if (*cp >= 'A' && *cp <= 'Z')
559 cmd[i] = *cp + 'a' - 'A';
562 if (i < (int) sizeof(cmd)-2)
/* Unexpected character: warn, then scan up to the next white space. */
569 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
571 while (*cp && *cp != ' ' && *cp != '\t' &&
572 *cp != '\n' && *cp != '\r')
/* Map the collected keyword to its token code. */
578 if (!strcmp (cmd, "begin"))
580 else if (!strcmp (cmd, "end"))
582 else if (!strcmp (cmd, "body"))
584 else if (!strcmp (cmd, "context"))
586 else if (!strcmp (cmd, "init"))
590 logf (LOG_WARN, "bad command %s", cmd);
/*
 * actionListMk: parse the action part s of a rule and build the action
 * list at ap. Tokens come from readParseToken() until it returns 0.
 * Code tokens are stored through regxCodeMk(); pattern tokens get their
 * own DFA, which is parsed with dfa_parse() and finalised with
 * dfa_mkstate(). BEGIN and INIT are rejected here with a warning.
 */
596 static int actionListMk (struct lexSpec *spec, const char *s,
597 struct lexRuleAction **ap)
603 while ((tok = readParseToken (&s, &len)))
611 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
613 regxCodeMk (&(*ap)->u.code, s, len);
617 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
619 (*ap)->u.pattern.body = bodyMark;
621 (*ap)->u.pattern.dfa = lexSpecDFA ();
623 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
/* NOTE(review): "%.*s" expects an int precision; s-s0 looks like a
 * pointer difference - confirm and cast to int if so. */
628 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
631 dfa_mkstate ((*ap)->u.pattern.dfa);
635 logf (LOG_WARN, "cannot use BEGIN here");
638 logf (LOG_WARN, "cannot use INIT here");
641 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/*
 * readOneSpec: process one logical line s of a filter specification.
 * A CONTEXT token must be followed by a name; the new context is linked
 * in front of spec->context. Otherwise the line either replaces the
 * begin/end/init action list of the current context or defines a pattern
 * rule: the pattern is parsed into the context DFA, the rule receives the
 * next rule number and its actions are built with actionListMk().
 * A context named "main" may be created here as the default.
 */
651 int readOneSpec (struct lexSpec *spec, const char *s)
655 struct lexContext *lc;
657 tok = readParseToken (&s, &len);
658 if (tok == REGX_CONTEXT)
660 char context_name[32];
661 tok = readParseToken (&s, &len);
662 if (tok != REGX_CODE)
664 logf (LOG_WARN, "missing name after CONTEXT keyword");
/* NOTE(review): len bytes are copied into the 32-byte context_name;
 * make sure len is limited to 31 before this memcpy. */
669 memcpy (context_name, s, len);
670 context_name[len] = '\0';
671 lc = lexContextCreate (context_name);
672 lc->next = spec->context;
677 spec->context = lexContextCreate ("main");
/* begin / end / init: drop the old list, then build the new one. */
682 actionListDel (&spec->context->beginActionList);
683 actionListMk (spec, s, &spec->context->beginActionList);
686 actionListDel (&spec->context->endActionList);
687 actionListMk (spec, s, &spec->context->endActionList);
690 actionListDel (&spec->context->initActionList);
691 actionListMk (spec, s, &spec->context->initActionList);
695 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
697 r = dfa_parse (spec->context->dfa, &s);
700 logf (LOG_WARN, "regular expression error. r=%d", r);
705 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* Register the rule at the head of the context's rule list. */
709 rp = (struct lexRule *) xmalloc (sizeof(*rp));
710 rp->info.no = spec->context->ruleNo++;
711 rp->next = spec->context->rules;
712 spec->context->rules = rp;
713 actionListMk (spec, s, &rp->info.actionList);
/*
 * readFileSpec: locate and read the filter specification for spec.
 * With a Tcl interpreter "<name>.tflt" is tried; "<name>.flt" is the
 * other candidate. Both are opened with yaz_path_fopen() on the data1
 * table path. The input is collected character by character in lineBuf;
 * each completed specification is terminated with '\0' and handed to
 * readOneSpec(). Afterwards every context gets its fastRule table (rule
 * number -> rule info) and its DFA is finalised with dfa_mkstate().
 */
718 int readFileSpec (struct lexSpec *spec)
720 struct lexContext *lc;
721 int c, i, errors = 0;
727 if (spec->tcl_interp)
/* NOTE(review): sprintf() writes into fname without a size limit -
 * confirm fname is large enough for spec->name plus the suffix. */
729 sprintf (fname, "%s.tflt", spec->name);
730 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
735 sprintf (fname, "%s.flt", spec->name);
736 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), fname, "r");
/* The message prints spec->name, not the file name that was tried. */
740 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
743 logf (LOG_LOG, "reading regx filter %s", fname);
745 if (spec->tcl_interp)
746 logf (LOG_LOG, "Tcl enabled");
748 lineBuf = wrbuf_alloc();
753 wrbuf_rewind (lineBuf);
/* Lines that start with '#', white space or an end-of-line character
 * are scanned up to the newline. */
754 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
756 while (c != '\n' && c != EOF)
769 wrbuf_putc(lineBuf, c);
/* NOTE(review): presumably a following line that starts with a blank or
 * tab continues the current specification - confirm. */
777 if (c != ' ' && c != '\t')
782 wrbuf_putc(lineBuf, '\0');
783 readOneSpec (spec, wrbuf_buf(lineBuf));
784 spec->lineNo += addLine;
788 wrbuf_free(lineBuf, 1);
793 debug_dfa_followpos = 1;
/* Build the per-context rule lookup tables and finalise the DFAs. */
796 for (lc = spec->context; lc; lc = lc->next)
799 lc->fastRule = (struct lexRuleInfo **)
800 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
801 for (i = 0; i < lc->ruleNo; i++)
802 lc->fastRule[i] = NULL;
803 for (rp = lc->rules; rp; rp = rp->next)
804 lc->fastRule[rp->info.no] = &rp->info;
805 dfa_mkstate (lc->dfa);
/* File-scope lexSpec pointer, initially NULL. */
814 static struct lexSpec *curLexSpec = NULL;
/*
 * execData: add the elen bytes at ebuf as text data at the current level.
 * If the node on top of d1_stack is already a data node its current
 * length is taken as the offset (org_len) for the new bytes; otherwise a
 * new DATA1N_data / DATA1I_text node is created below parent, linked
 * after the previous sibling and made the top of d1_stack.
 * The text itself is collected in concatBuf[d1_level], which is replaced
 * by a larger buffer (needed size plus 256 bytes) when it is too small.
 * formatted_text is stored in the new node.
 */
817 static void execData (struct lexSpec *spec,
818 const char *ebuf, int elen, int formatted_text)
820 struct data1_node *res, *parent;
823 if (elen == 0) /* shouldn't happen, but it does! */
827 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
828 ebuf, 15, ebuf + elen-15);
830 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
832 logf (LOG_DEBUG, "data (%d bytes)", elen);
835 if (spec->d1_level <= 1)
838 parent = spec->d1_stack[spec->d1_level -1];
/* Continue an existing data node at this level when there is one. */
841 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
842 org_len = res->u.data.len;
847 res = data1_mk_node (spec->dh, spec->m);
848 res->parent = parent;
849 res->which = DATA1N_data;
850 res->u.data.what = DATA1I_text;
852 res->u.data.formatted_text = formatted_text;
/* NOTE(review): the assignments to u.data.data below (direct copy versus
 * 0) look like alternative build variants - confirm which is active;
 * tagDataRelease() asserts that u.data.data is still 0. */
854 if (elen > DATA1_LOCALDATA)
855 res->u.data.data = nmem_malloc (spec->m, elen);
857 res->u.data.data = res->lbuf;
858 memcpy (res->u.data.data, ebuf, elen);
860 res->u.data.data = 0;
862 res->root = parent->root;
864 parent->last_child = res;
865 if (spec->d1_stack[spec->d1_level])
866 spec->d1_stack[spec->d1_level]->next = res;
869 spec->d1_stack[spec->d1_level] = res;
/* Make sure the level's text buffer can take org_len + elen bytes. */
871 if (org_len + elen >= spec->concatBuf[spec->d1_level]->max)
873 char *old_buf, *new_buf;
875 spec->concatBuf[spec->d1_level]->max = org_len + elen + 256;
876 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level]->max);
877 if ((old_buf = spec->concatBuf[spec->d1_level]->buf))
879 memcpy (new_buf, old_buf, org_len);
882 spec->concatBuf[spec->d1_level]->buf = new_buf;
884 assert (spec->concatBuf[spec->d1_level]);
/* Append the new bytes behind the text collected so far. */
885 memcpy (spec->concatBuf[spec->d1_level]->buf + org_len, ebuf, elen);
886 res->u.data.len += elen;
/* execDataP: forwards its arguments unchanged to execData(). */
889 static void execDataP (struct lexSpec *spec,
890 const char *ebuf, int elen, int formatted_text)
892 execData (spec, ebuf, elen, formatted_text);
/*
 * tagDataRelease: finalise the text node at the current level.
 * If the node on top of d1_stack is a text data node, its collected bytes
 * are copied from concatBuf[d1_level] into the node: into the node's own
 * lbuf when the length is at most DATA1_LOCALDATA, otherwise into memory
 * obtained with nmem_malloc(). The node must not have data attached yet
 * and must have a positive length (both are asserted).
 */
895 static void tagDataRelease (struct lexSpec *spec)
899 if ((res = spec->d1_stack[spec->d1_level]) &&
900 res->which == DATA1N_data &&
901 res->u.data.what == DATA1I_text)
903 assert (!res->u.data.data);
904 assert (res->u.data.len > 0);
905 if (res->u.data.len > DATA1_LOCALDATA)
906 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
908 res->u.data.data = res->lbuf;
909 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level]->buf,
/*
 * variantBegin: open a variant node for the given class / type / value
 * below the node on top of d1_stack.
 * class_str and type_str are copied (truncated to DATA1_MAX_SYMBOL-1
 * characters) into local NUL-terminated buffers and used for the
 * data1_getvartypebyct() lookup. If the parent is not already a variant
 * node, an intermediate variant node with type 0 and value 0 is pushed
 * first. The new node keeps its value in lbuf, truncated to
 * DATA1_LOCALDATA-1 characters, and is pushed on d1_stack.
 *
 * NOTE(review): parent is read from d1_stack[d1_level - 1] in its
 * initialiser, before the d1_level == 0 check - that is an out-of-range
 * index when no record has been started.
 */
914 static void variantBegin (struct lexSpec *spec,
915 const char *class_str, int class_len,
916 const char *type_str, int type_len,
917 const char *value_str, int value_len)
919 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
920 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
925 if (spec->d1_level == 0)
927 logf (LOG_WARN, "in variant begin. No record type defined");
/* Bounded copies of the class and type names. */
930 if (class_len >= DATA1_MAX_SYMBOL)
931 class_len = DATA1_MAX_SYMBOL-1;
932 memcpy (tclass, class_str, class_len);
933 tclass[class_len] = '\0';
935 if (type_len >= DATA1_MAX_SYMBOL)
936 type_len = DATA1_MAX_SYMBOL-1;
937 memcpy (ttype, type_str, type_len);
938 ttype[type_len] = '\0';
941 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype,
946 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* Parent is not a variant: insert an intermediate variant node. */
950 if (parent->which != DATA1N_variant)
952 res = data1_mk_node (spec->dh, spec->m);
953 res->parent = parent;
954 res->which = DATA1N_variant;
955 res->u.variant.type = 0;
956 res->u.variant.value = 0;
957 res->root = parent->root;
959 parent->last_child = res;
960 if (spec->d1_stack[spec->d1_level])
962 tagDataRelease (spec);
963 spec->d1_stack[spec->d1_level]->next = res;
967 spec->d1_stack[spec->d1_level] = res;
968 spec->d1_stack[++(spec->d1_level)] = NULL;
/* Walk down the stack over variant nodes, looking for the same type. */
970 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
971 if (spec->d1_stack[i]->u.variant.type == tp)
978 logf (LOG_DEBUG, "variant node (%d)", spec->d1_level);
/* Create the variant node itself below the (possibly new) parent. */
980 parent = spec->d1_stack[spec->d1_level-1];
981 res = data1_mk_node (spec->dh, spec->m);
982 res->parent = parent;
983 res->which = DATA1N_variant;
984 res->root = parent->root;
985 res->u.variant.type = tp;
987 if (value_len >= DATA1_LOCALDATA)
988 value_len =DATA1_LOCALDATA-1;
989 memcpy (res->lbuf, value_str, value_len);
990 res->lbuf[value_len] = '\0';
992 res->u.variant.value = res->lbuf;
994 parent->last_child = res;
995 if (spec->d1_stack[spec->d1_level])
997 tagDataRelease (spec);
998 spec->d1_stack[spec->d1_level]->next = res;
1001 parent->child = res;
1002 spec->d1_stack[spec->d1_level] = res;
1003 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagStrip: trim white space at both ends of the tag described by
 * *tag / *len. The first loop scans backwards over trailing white space,
 * the second scans forwards over leading white space.
 * NOTE(review): isspace() receives a plain char here; a cast to
 * unsigned char keeps the call defined for bytes >= 0x80.
 */
1006 static void tagStrip (const char **tag, int *len)
1010 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
1013 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/*
 * tagBegin: open an element named by the len bytes at tag.
 * The tag is trimmed with tagStrip(), a DATA1N_tag node is created below
 * the node on top of d1_stack and the name is stored NUL-terminated: in
 * the node's lbuf when len < DATA1_LOCALDATA, otherwise in memory from
 * nmem_malloc(). The element definition is looked up and stored in
 * u.tag.element, pending text of a previous sibling is released with
 * tagDataRelease(), and the node is pushed on d1_stack.
 *
 * NOTE(review): parent is read from d1_stack[d1_level - 1] (and passed to
 * get_parent_tag) in the initialisers, before the d1_level == 0 check -
 * that is an out-of-range index when no record has been started.
 */
1019 static void tagBegin (struct lexSpec *spec,
1020 const char *tag, int len)
1022 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
1023 data1_element *elem = NULL;
1024 data1_node *partag = get_parent_tag(spec->dh, parent);
1026 data1_element *e = NULL;
1029 if (spec->d1_level == 0)
1031 logf (LOG_WARN, "in element begin. No record type defined");
1034 tagStrip (&tag, &len);
1036 res = data1_mk_node (spec->dh, spec->m);
1037 res->parent = parent;
1038 res->which = DATA1N_tag;
1039 res->u.tag.get_bytes = -1;
/* Long names go to nmem memory, short ones into the node's lbuf. */
1041 if (len >= DATA1_LOCALDATA)
1042 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
1044 res->u.tag.tag = res->lbuf;
1046 memcpy (res->u.tag.tag, tag, len);
1047 res->u.tag.tag[len] = '\0';
1050 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1052 if (parent->which == DATA1N_variant)
1055 if (!(e = partag->u.tag.element))
1058 elem = data1_getelementbytagname (spec->dh,
1059 spec->d1_stack[0]->u.root.absyn,
1061 res->u.tag.element = elem;
1062 res->u.tag.node_selected = 0;
1063 res->u.tag.make_variantlist = 0;
1064 res->u.tag.no_data_requested = 0;
1065 res->root = parent->root;
/* Link the node into the tree and push it on the stack. */
1067 parent->last_child = res;
1068 if (spec->d1_stack[spec->d1_level])
1070 tagDataRelease (spec);
1071 spec->d1_stack[spec->d1_level]->next = res;
1074 parent->child = res;
1075 spec->d1_stack[spec->d1_level] = res;
1076 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagEnd: close elements while the nesting level is above min_level.
 * The tag is trimmed first. For each level the pending text is released
 * with tagDataRelease(); the comparison below looks for a tag node whose
 * name has the same length and bytes as tag.
 */
1079 static void tagEnd (struct lexSpec *spec, int min_level,
1080 const char *tag, int len)
1082 tagStrip (&tag, &len);
1083 while (spec->d1_level > min_level)
1085 tagDataRelease (spec);
1087 if (spec->d1_level == 0)
1089 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1091 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1093 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1097 logf (LOG_DEBUG, "end tag (%d)", spec->d1_level);
/*
 * tryMatch: run dfa over the input starting at *pptr.
 * Characters are fetched with f_win_advance(); the scan starts in state 0
 * and follows the transition whose character range contains c.
 * last_rule / last_ptr remember the rule number and end position of the
 * current match. On a match *mptr receives the start of the match and
 * *pptr the position just behind its last character.
 */
1102 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1105 struct DFA_state *state = dfa->states[0];
1108 unsigned char c_prev = 0;
1109 int ptr = *pptr; /* current pointer */
1110 int start_ptr = *pptr; /* first char of match */
1111 int last_ptr = 0; /* last char of match */
1112 int last_rule = 0; /* rule number of current match */
1117 c = f_win_advance (spec, &ptr);
1118 if (ptr == F_WIN_EOF)
1135 *mptr = start_ptr; /* match starts here */
1136 *pptr = last_ptr; /* match end here (+1) */
1139 state = dfa->states[0];
1144 else if (c >= t->ch[0] && c <= t->ch[1])
1146 state = dfa->states[t->to];
1151 last_rule = state->rule_no;
1156 last_rule = state->rule_nno;
/*
 * execTok: fetch the next token from the code string at *src.
 * Blanks and tabs are skipped first. Recognised forms:
 *   $<digits>       - argument n; its text is fetched through f_win_get()
 *                     from arg_start[n] / arg_end[n];
 *   "..."           - quoted string;
 *   newline or ';'  - statement separator;
 *   anything else   - a word that ends at white space.
 * The token text is returned through tokBuf / tokLen; the return value is
 * a token class (execCode() tests for the value 3 when it reads options
 * such as "-text").
 */
1168 static int execTok (struct lexSpec *spec, const char **src,
1169 const char **tokBuf, int *tokLen)
1171 const char *s = *src;
1173 while (*s == ' ' || *s == '\t')
/* "$n": decimal argument number. */
1177 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1181 while (*s >= '0' && *s <= '9')
1182 n = n*10 + (*s++ -'0');
1183 if (spec->arg_no == 0)
1190 if (n >= spec->arg_no)
1192 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1196 else if (*s == '\"')
1199 while (*s && *s != '\"')
1201 *tokLen = s - *tokBuf;
1206 else if (*s == '\n' || *s == ';')
1214 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1217 *tokLen = s - *tokBuf;
1224 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1227 *tokLen = s - *tokBuf;
/*
 * regxStrz: copy len bytes from src into the caller-supplied buffer str.
 * NOTE(review): no capacity for str is passed in - confirm that len is
 * bounded to the caller's buffer size.
 */
1233 static char *regxStrz (const char *src, int len, char *str)
1237 memcpy (str, src, len);
/*
 * cmd_tcl_begin: Tcl command procedure for "begin".
 * clientData carries the lexSpec. Sub-commands (argv[1]):
 *   record <absyn>                  - look up the abstract syntax and
 *                                     create the root node of a record;
 *   element <tag>                   - tagBegin();
 *   variant <class> <type> <value>  - variantBegin();
 *   context <name>                  - push the named context, or warn
 *                                     when it is unknown.
 */
1243 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1244 int argc, char **argv)
1246 struct lexSpec *spec = (struct lexSpec *) clientData;
1249 if (!strcmp(argv[1], "record") && argc == 3)
1251 char *absynName = argv[2];
1255 logf (LOG_DEBUG, "begin record %s", absynName);
1257 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1258 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1263 res = data1_mk_node (spec->dh, spec->m);
1264 res->which = DATA1N_root;
/* NOTE(review): the node keeps the argv[2] pointer itself - confirm the
 * argument string outlives the node. */
1265 res->u.root.type = absynName;
1266 res->u.root.absyn = absyn;
1269 spec->d1_stack[spec->d1_level] = res;
1270 spec->d1_stack[++(spec->d1_level)] = NULL;
1273 else if (!strcmp(argv[1], "element") && argc == 3)
1275 tagBegin (spec, argv[2], strlen(argv[2]));
1277 else if (!strcmp (argv[1], "variant") && argc == 5)
1279 variantBegin (spec, argv[2], strlen(argv[2]),
1280 argv[3], strlen(argv[3]),
1281 argv[4], strlen(argv[4]));
1283 else if (!strcmp (argv[1], "context") && argc == 3)
1285 struct lexContext *lc = spec->context;
1287 logf (LOG_DEBUG, "begin context %s",argv[2]);
/* Search the context list by name. */
1289 while (lc && strcmp (argv[2], lc->name))
/* NOTE(review): the push does not visibly check context_stack_size. */
1293 spec->context_stack[++(spec->context_stack_top)] = lc;
1296 logf (LOG_WARN, "unknown context %s", argv[2]);
/*
 * cmd_tcl_end: Tcl command procedure for "end".
 * clientData carries the lexSpec. Sub-commands (argv[1]):
 *   record   - release pending text while levels remain and set
 *              stop_flag;
 *   element  - close elements with tagEnd(); "-record" is accepted as
 *              argv[2], and stop_flag is set in the d1_level == 0 branch;
 *   context  - pop the context stack unless it is already at index 0.
 */
1303 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1304 int argc, char **argv)
1306 struct lexSpec *spec = (struct lexSpec *) clientData;
1310 if (!strcmp (argv[1], "record"))
1312 while (spec->d1_level)
1314 tagDataRelease (spec);
1318 logf (LOG_DEBUG, "end record");
1320 spec->stop_flag = 1;
1322 else if (!strcmp (argv[1], "element"))
1326 if (argc >= 3 && !strcmp(argv[2], "-record"))
1335 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1336 if (spec->d1_level == 0)
1339 logf (LOG_DEBUG, "end element end records");
1341 spec->stop_flag = 1;
1344 else if (!strcmp (argv[1], "context"))
1347 logf (LOG_DEBUG, "end context");
1349 if (spec->context_stack_top)
1350 (spec->context_stack_top)--;
/*
 * cmd_tcl_data: Tcl command procedure for "data".
 * Options "-text" and "-element <name>" are recognised. The data argument
 * is passed to execData(); the code also contains a tagBegin() call for
 * the element name before it and a tagEnd() call to level 1 after it.
 */
1357 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1358 int argc, char **argv)
1362 const char *element = 0;
1363 struct lexSpec *spec = (struct lexSpec *) clientData;
1367 if (!strcmp("-text", argv[argi]))
1372 else if (!strcmp("-element", argv[argi]))
1376 element = argv[argi++];
1382 tagBegin (spec, element, strlen(element));
1386 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1390 tagEnd (spec, 1, NULL, 0);
/*
 * cmd_tcl_unread: Tcl command procedure for "unread".
 * Moves the read position back to the start of argument no, plus an
 * optional "-offset <n>". An index at or beyond arg_no is clamped to the
 * last argument.
 * NOTE(review): atoi() reports no errors and a negative index is not
 * visibly rejected (with arg_no == 0 the clamp yields -1) - confirm the
 * inputs are validated.
 */
1394 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1395 int argc, char **argv)
1397 struct lexSpec *spec = (struct lexSpec *) clientData;
1404 if (!strcmp("-offset", argv[argi]))
1409 offset = atoi(argv[argi]);
1418 no = atoi(argv[argi]);
1419 if (no >= spec->arg_no)
1420 no = spec->arg_no - 1;
1421 spec->ptr = spec->arg_start[no] + offset;
/*
 * execTcl: run the Tcl code of an action.
 * Each argument i is first exported as the Tcl variable named "i"
 * (decimal): its text is fetched with f_win_get(), NUL-terminated in
 * place for the Tcl_SetVar() call and restored afterwards. The code is
 * then evaluated (object form or string form) and a failure is logged
 * together with the interpreter's errorLine, result and errorInfo.
 */
1425 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1429 for (i = 0; i < spec->arg_no; i++)
1431 char var_name[10], *var_buf;
1434 sprintf (var_name, "%d", i);
1435 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* NOTE(review): var_buf[var_len] is the byte just behind the returned
 * range - confirm it always lies inside the window buffer. */
1439 ch = var_buf[var_len];
1440 var_buf[var_len] = '\0';
1441 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1442 var_buf[var_len] = ch;
1446 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1448 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1451 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1452 logf(LOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1453 spec->tcl_interp->errorLine,
1454 spec->tcl_interp->result,
1455 err ? err : "[NO ERRORINFO]");
/*
 * execCode: interpret the built-in (non-Tcl) code of an action.
 * The code string is tokenised with execTok(); command words are copied
 * with regxStrz() and dispatched:
 *   begin record|element|variant|context ...
 *   end record|element|context ...
 *   data [-text] [-element <name>] <items>
 *   unread [-offset <n>] <index>
 *   context <name>
 * Unknown commands and stray tokens are reported with a warning.
 */
1461 static void execCode (struct lexSpec *spec, struct regxCode *code)
1463 const char *s = code->str;
1465 const char *cmd_str;
1467 r = execTok (spec, &s, &cmd_str, &cmd_len);
1474 r = execTok (spec, &s, &cmd_str, &cmd_len);
1477 p = regxStrz (cmd_str, cmd_len, ptmp);
/* ---- begin ... ---- */
1478 if (!strcmp (p, "begin"))
1480 r = execTok (spec, &s, &cmd_str, &cmd_len);
1483 logf (LOG_WARN, "missing keyword after 'begin'");
1486 p = regxStrz (cmd_str, cmd_len, ptmp);
1487 if (!strcmp (p, "record"))
1489 r = execTok (spec, &s, &cmd_str, &cmd_len);
1492 if (spec->d1_level == 0)
/* NOTE(review): static buffer shared by all calls, and the root node
 * stores a pointer to it; also confirm cmd_len is limited to 63 before
 * the memcpy. */
1494 static char absynName[64];
1499 memcpy (absynName, cmd_str, cmd_len);
1500 absynName[cmd_len] = '\0';
1503 logf (LOG_DEBUG, "begin record %s", absynName);
1505 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1506 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1511 res = data1_mk_node (spec->dh, spec->m);
1512 res->which = DATA1N_root;
1513 res->u.root.type = absynName;
1514 res->u.root.absyn = absyn;
1517 spec->d1_stack[spec->d1_level] = res;
1518 spec->d1_stack[++(spec->d1_level)] = NULL;
1521 r = execTok (spec, &s, &cmd_str, &cmd_len);
1523 else if (!strcmp (p, "element"))
1525 r = execTok (spec, &s, &cmd_str, &cmd_len);
1528 tagBegin (spec, cmd_str, cmd_len);
1529 r = execTok (spec, &s, &cmd_str, &cmd_len);
1531 else if (!strcmp (p, "variant"))
1534 const char *class_str = NULL;
1536 const char *type_str = NULL;
1538 const char *value_str = NULL;
1539 r = execTok (spec, &s, &cmd_str, &cmd_len);
1542 class_str = cmd_str;
1543 class_len = cmd_len;
1544 r = execTok (spec, &s, &cmd_str, &cmd_len);
1550 r = execTok (spec, &s, &cmd_str, &cmd_len);
1553 value_str = cmd_str;
1554 value_len = cmd_len;
1556 variantBegin (spec, class_str, class_len,
1557 type_str, type_len, value_str, value_len);
1560 r = execTok (spec, &s, &cmd_str, &cmd_len);
1562 else if (!strcmp (p, "context"))
1566 struct lexContext *lc = spec->context;
1567 r = execTok (spec, &s, &cmd_str, &cmd_len);
1568 p = regxStrz (cmd_str, cmd_len, ptmp);
1570 logf (LOG_DEBUG, "begin context %s", p);
/* Look the context up by name and push it. */
1572 while (lc && strcmp (p, lc->name))
1575 spec->context_stack[++(spec->context_stack_top)] = lc;
1577 logf (LOG_WARN, "unknown context %s", p);
1580 r = execTok (spec, &s, &cmd_str, &cmd_len);
1584 logf (LOG_WARN, "bad keyword '%s' after begin", p);
/* ---- end ... ---- */
1587 else if (!strcmp (p, "end"))
1589 r = execTok (spec, &s, &cmd_str, &cmd_len);
1592 logf (LOG_WARN, "missing keyword after 'end'");
1595 p = regxStrz (cmd_str, cmd_len, ptmp);
1596 if (!strcmp (p, "record"))
1598 while (spec->d1_level)
1600 tagDataRelease (spec);
1603 r = execTok (spec, &s, &cmd_str, &cmd_len);
1605 logf (LOG_DEBUG, "end record");
1607 spec->stop_flag = 1;
1609 else if (!strcmp (p, "element"))
/* Options (token class 3) come first; "-record" is the one recognised. */
1612 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1614 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1619 tagEnd (spec, min_level, cmd_str, cmd_len);
1620 r = execTok (spec, &s, &cmd_str, &cmd_len);
1623 tagEnd (spec, min_level, NULL, 0);
1624 if (spec->d1_level == 0)
1627 logf (LOG_DEBUG, "end element end records");
1629 spec->stop_flag = 1;
1633 else if (!strcmp (p, "context"))
1636 logf (LOG_DEBUG, "end context");
1638 if (spec->context_stack_top)
1639 (spec->context_stack_top)--;
1640 r = execTok (spec, &s, &cmd_str, &cmd_len);
1643 logf (LOG_WARN, "bad keyword '%s' after end", p);
/* ---- data ... ---- */
1645 else if (!strcmp (p, "data"))
1649 const char *element_str = NULL;
1651 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1653 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1655 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1657 r = execTok (spec, &s, &element_str, &element_len);
1662 logf (LOG_WARN, "bad data option: %.*s",
1667 logf (LOG_WARN, "missing data item after data");
1671 tagBegin (spec, element_str, element_len);
1674 execData (spec, cmd_str, cmd_len,textFlag);
1675 r = execTok (spec, &s, &cmd_str, &cmd_len);
1678 tagEnd (spec, 1, NULL, 0);
/* ---- unread ... ---- */
1680 else if (!strcmp (p, "unread"))
1683 r = execTok (spec, &s, &cmd_str, &cmd_len);
1684 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1686 r = execTok (spec, &s, &cmd_str, &cmd_len);
1689 logf (LOG_WARN, "missing number after -offset");
1692 p = regxStrz (cmd_str, cmd_len, ptmp);
1694 r = execTok (spec, &s, &cmd_str, &cmd_len);
1700 logf (LOG_WARN, "missing index after unread command");
/* The index must be exactly one decimal digit. */
1703 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1705 logf (LOG_WARN, "bad index after unread command");
1710 no = *cmd_str - '0';
1711 if (no >= spec->arg_no)
1712 no = spec->arg_no - 1;
1713 spec->ptr = spec->arg_start[no] + offset;
1715 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* ---- context <name>: replaces the top of the context stack ---- */
1717 else if (!strcmp (p, "context"))
1721 struct lexContext *lc = spec->context;
1722 r = execTok (spec, &s, &cmd_str, &cmd_len);
1723 p = regxStrz (cmd_str, cmd_len, ptmp);
1725 while (lc && strcmp (p, lc->name))
1728 spec->context_stack[spec->context_stack_top] = lc;
1730 logf (LOG_WARN, "unknown context %s", p);
1733 r = execTok (spec, &s, &cmd_str, &cmd_len);
1737 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1738 r = execTok (spec, &s, &cmd_str, &cmd_len);
1743 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1745 r = execTok (spec, &s, &cmd_str, &cmd_len);
/*
 * execAction: execute the action list ap for a match that starts at
 * start_ptr; *pptr is the current input position.
 * arg_start[] / arg_end[] record the file range of each argument
 * (argument 0 starts at start_ptr) and are published through
 * spec->arg_start, spec->arg_end and spec->arg_no for the code handlers.
 * Pattern actions call tryMatch(); for "body" patterns the text in front
 * of the match and the match itself are recorded as separate arguments.
 * Code actions run through execTcl() when a Tcl interpreter exists and
 * through execCode() otherwise. stop_flag is checked after code has run.
 */
1752 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1753 int start_ptr, int *pptr)
1762 arg_start[0] = start_ptr;
1764 spec->arg_start = arg_start;
1765 spec->arg_end = arg_end;
/* Body pattern: everything up to the next match of the pattern. */
1772 if (ap->u.pattern.body)
1774 arg_start[arg_no] = *pptr;
1775 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1777 arg_end[arg_no] = F_WIN_EOF;
1779 arg_start[arg_no] = F_WIN_EOF;
1780 arg_end[arg_no] = F_WIN_EOF;
1785 arg_end[arg_no] = sptr;
1787 arg_start[arg_no] = sptr;
1788 arg_end[arg_no] = *pptr;
/* Plain pattern: the match is compared with the current position. */
1793 arg_start[arg_no] = *pptr;
1794 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1796 if (sptr != arg_start[arg_no])
1798 arg_end[arg_no] = *pptr;
1803 spec->arg_no = arg_no;
1806 if (spec->tcl_interp)
1807 execTcl(spec, ap->u.code);
1809 execCode (spec, ap->u.code);
1811 execCode (spec, ap->u.code);
1814 if (spec->stop_flag)
1818 arg_start[arg_no] = *pptr;
1819 arg_end[arg_no] = F_WIN_EOF;
/* execRule: log the rule number at debug level, then run the action list of
 * context->fastRule[ruleNo] through execAction() and return its result.
 * NOTE(review): the remaining execAction() arguments are on a line that is
 * not visible here. */
1828 static int execRule (struct lexSpec *spec, struct lexContext *context,
1829 int ruleNo, int start_ptr, int *pptr)
1832 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1834 return execAction (spec, context->fastRule[ruleNo]->actionList,
/* lexNode: scan the input from *ptr using the DFA of the context currently
 * on top of spec->context_stack.
 * Runs of characters that no rule matched (skip_ptr up to the start of a
 * match) are passed to execDataP(); a completed match is dispatched with
 * execRule().
 * NOTE(review): the loop scaffolding and the return statements are not
 * visible in this part of the file -- confirm what node is returned. */
1838 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1840 struct lexContext *context = spec->context_stack[spec->context_stack_top];
/* scanning starts in DFA state 0 */
1841 struct DFA_state *state = context->dfa->states[0];
1844 unsigned char c_prev = '\n';
1846 int last_rule = 0; /* rule number of current match */
1847 int last_ptr = *ptr; /* last char of match */
1848 int start_ptr = *ptr; /* first char of match */
1849 int skip_ptr = *ptr; /* first char of run */
/* fetch the next input character; *ptr equals F_WIN_EOF at end of input */
1853 c = f_win_advance (spec, ptr);
1854 if (*ptr == F_WIN_EOF)
1856 /* end of file met */
1859 /* there was a match */
1860 if (skip_ptr < start_ptr)
1862 /* deal with chars that didn't match */
1865 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1866 execDataP (spec, buf, size, 0);
1868 /* restore pointer */
1871 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1873 /* restore skip pointer */
1877 else if (skip_ptr < *ptr)
1879 /* deal with chars that didn't match */
1882 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1883 execDataP (spec, buf, size, 0);
1885 if (*ptr == F_WIN_EOF)
1892 { /* no transition for character c ... */
1895 if (skip_ptr < start_ptr)
1897 /* deal with chars that didn't match */
1900 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1901 execDataP (spec, buf, size, 0);
1903 /* restore pointer */
1905 if (!execRule (spec, context, last_rule, start_ptr, ptr))
/* the end callback is optional and is not invoked at end of input */
1907 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1910 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1912 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* re-read the current context from the top of the context stack */
1916 context = spec->context_stack[spec->context_stack_top];
/* the next match attempt starts at the current position */
1919 last_ptr = start_ptr = *ptr;
1923 c_prev = f_win_advance (spec, &start_ptr);
1928 c_prev = f_win_advance (spec, &start_ptr);
/* restart the DFA from state 0 of the (possibly new) context */
1931 state = context->dfa->states[0];
1934 else if (c >= t->ch[0] && c <= t->ch[1])
1935 { /* transition ... */
1936 state = context->dfa->states[t->to];
/* remember the rule attached to the state just entered */
1941 last_rule = state->rule_no;
/* NOTE(review): rule_nno is a second rule slot on the state; the condition
 * selecting between rule_no and rule_nno is not fully visible -- confirm */
1944 else if (state->rule_nno)
1946 last_rule = state->rule_nno;
/* lexRoot: process one record with the context named context_name.
 * Resets the stop flag and the context stack, looks the context up by name
 * in the spec's context list, runs its init and begin action lists, scans
 * the input with lexNode(), releases tags while spec->d1_level is non-zero,
 * runs the end action list and returns spec->d1_stack[0].
 * NOTE(review): how `offset` seeds the read position, and the conditions
 * guarding the init/begin actions, are on lines not visible here. */
1958 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1959 const char *context_name)
1961 struct lexContext *lt = spec->context;
1964 spec->stop_flag = 0;
1966 spec->context_stack_top = 0;
/* search the context list for a name equal to context_name */
1969 if (!strcmp (lt->name, context_name))
1975 logf (LOG_WARN, "cannot find context %s", context_name);
/* the selected context becomes the current stack entry */
1978 spec->context_stack[spec->context_stack_top] = lt;
1979 spec->d1_stack[spec->d1_level] = NULL;
1984 execAction (spec, lt->initActionList, ptr, &ptr);
1987 execAction (spec, lt->beginActionList, ptr, &ptr);
1988 lexNode (spec, &ptr);
/* release tags for as long as the data1 nesting level is non-zero */
1989 while (spec->d1_level)
1991 tagDataRelease (spec);
1994 execAction (spec, lt->endActionList, ptr, &ptr);
1995 return spec->d1_stack[0];
/* grs_destroy: tear down the filter state created by grs_init().
 * clientData is treated as a struct lexSpecs; its lexSpec is destroyed with
 * lexSpecDestroy().
 * NOTE(review): whether the lexSpecs container itself is freed is on a line
 * not visible here. */
1998 void grs_destroy(void *clientData)
2000 struct lexSpecs *specs = (struct lexSpecs *) clientData;
2003 lexSpecDestroy(&specs->spec);
/* grs_init: allocate the per-filter struct lexSpecs with xmalloc().
 * NOTE(review): initialisation of its members and the return statement are
 * not visible here; presumably the new object is returned as the client
 * data later passed to grs_read_regx()/grs_read_tcl()/grs_destroy(). */
2008 void *grs_init(void)
2010 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/* grs_read_regx: read one record with the regx filter named by p->type.
 * The lexSpec is kept in the client data; it is rebuilt when none exists or
 * when its name differs from p->type. The input window is then reset, the
 * callbacks from `p` are attached, and the record is parsed by lexRoot()
 * starting in the "main" context. Returns lexRoot()'s result.
 * NOTE(review): the handling of a readFileSpec() failure (beyond destroying
 * the spec) is on lines not visible here. */
2015 data1_node *grs_read_regx (struct grs_read_info *p)
2018 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2019 struct lexSpec **curLexSpec = &specs->spec;
2022 logf (LOG_DEBUG, "grs_read_regx");
/* rebuild the spec if there is none yet or it belongs to another filter */
2024 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2027 lexSpecDestroy (curLexSpec);
2028 *curLexSpec = lexSpecCreate (p->type, p->dh);
2029 res = readFileSpec (*curLexSpec);
2032 lexSpecDestroy (curLexSpec);
2036 (*curLexSpec)->dh = p->dh;
/* start with an empty input window and attach the caller's I/O callbacks */
2039 (*curLexSpec)->f_win_start = 0;
2040 (*curLexSpec)->f_win_end = 0;
2041 (*curLexSpec)->f_win_rf = p->readf;
2042 (*curLexSpec)->f_win_sf = p->seekf;
2043 (*curLexSpec)->f_win_fh = p->fh;
2044 (*curLexSpec)->f_win_ef = p->endf;
2045 (*curLexSpec)->f_win_size = 500000;
2047 (*curLexSpec)->m = p->mem;
2048 return lexRoot (*curLexSpec, p->offset, "main");
/* Record-type descriptor for the regx filter.
 * NOTE(review): the initializer members are on lines not visible here. */
2051 static struct recTypeGrs regx_type = {
/* Exported handle for the regx record type: the address of the file-static
 * regx_type descriptor. */
2058 RecTypeGrs recTypeGrs_regx = &regx_type;
/* grs_read_tcl: read one record with the Tcl filter named by p->type.
 * Works like grs_read_regx(), except that a freshly created lexSpec also
 * gets its own Tcl interpreter with the commands "begin", "end", "data" and
 * "unread" registered before the filter specification is read.
 * Returns the result of lexRoot() run in the "main" context.
 * NOTE(review): the handling of a readFileSpec() failure (beyond destroying
 * the spec) is on lines not visible here. */
2061 data1_node *grs_read_tcl (struct grs_read_info *p)
2064 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2065 struct lexSpec **curLexSpec = &specs->spec;
2068 logf (LOG_DEBUG, "grs_read_tcl");
/* rebuild the spec if there is none yet or it belongs to another filter */
2070 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2072 Tcl_Interp *tcl_interp;
2074 lexSpecDestroy (curLexSpec);
2075 *curLexSpec = lexSpecCreate (p->type, p->dh);
/* the interpreter is stored in the spec; each command receives the spec as
 * its client data */
2076 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
2077 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2078 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2079 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
2080 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2082 res = readFileSpec (*curLexSpec);
2085 lexSpecDestroy (curLexSpec);
2089 (*curLexSpec)->dh = p->dh;
/* start with an empty input window and attach the caller's I/O callbacks */
2092 (*curLexSpec)->f_win_start = 0;
2093 (*curLexSpec)->f_win_end = 0;
2094 (*curLexSpec)->f_win_rf = p->readf;
2095 (*curLexSpec)->f_win_sf = p->seekf;
2096 (*curLexSpec)->f_win_fh = p->fh;
2097 (*curLexSpec)->f_win_ef = p->endf;
2098 (*curLexSpec)->f_win_size = 500000;
2100 (*curLexSpec)->m = p->mem;
2101 return lexRoot (*curLexSpec, p->offset, "main");
/* Record-type descriptor for the Tcl filter.
 * NOTE(review): the initializer members are on lines not visible here. */
2104 static struct recTypeGrs tcl_type = {
/* Exported handle for the Tcl record type: the address of tcl_type. */
2111 RecTypeGrs recTypeGrs_tcl = &tcl_type;