2 * Copyright (C) 1994-1999, Index Data
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.26 1999-05-26 07:49:14 adam
10 * Revision 1.25 1999/05/25 12:33:32 adam
11 * Fixed bug in Tcl filter.
13 * Revision 1.24 1999/05/21 11:08:46 adam
14 * Tcl filter attempts to read <filt>.tflt. Improvements to configure
15 * script so that it reads uninstalled Tcl source.
17 * Revision 1.23 1999/05/20 12:57:18 adam
18 * Implemented TCL filter. Updated recctrl system.
20 * Revision 1.22 1998/11/03 16:07:13 adam
23 * Revision 1.21 1998/11/03 15:43:39 adam
24 * Fixed bug introduced by previous commit.
26 * Revision 1.20 1998/11/03 14:51:28 adam
27 * Changed code so that it creates as few data1 nodes as possible.
29 * Revision 1.19 1998/11/03 10:22:39 adam
30 * Fixed memory leak that could occur when large data1 nodes were
31 * concatenated. Data-type data1_nodes may have multiple nodes.
33 * Revision 1.18 1998/10/15 13:11:47 adam
34 * Added support for option -record for "end element". When specified
35 * end element will mark end-of-record when at outer-level.
37 * Revision 1.17 1998/07/01 10:13:51 adam
40 * Revision 1.16 1998/06/30 15:15:09 adam
41 * Tags are trimmed: white space removed before- and after the tag.
43 * Revision 1.15 1998/06/30 12:55:45 adam
46 * Revision 1.14 1998/03/05 08:41:00 adam
47 * Implemented rule contexts.
49 * Revision 1.13 1997/12/12 06:33:58 adam
50 * Fixed bug that showed up when multiple filters were used.
51 * Made one routine thread-safe.
53 * Revision 1.12 1997/11/18 10:03:24 adam
54 * Member num_children removed from data1_node.
56 * Revision 1.11 1997/11/06 11:41:01 adam
57 * Implemented "begin variant" for the sgml.regx filter.
59 * Revision 1.10 1997/10/31 12:36:12 adam
60 * Minor change that avoids compiler warning.
62 * Revision 1.9 1997/09/29 09:02:49 adam
63 * Fixed small bug (introduced by previous commit).
65 * Revision 1.8 1997/09/17 12:19:22 adam
66 * Zebra version corresponds to YAZ version 1.4.
67 * Changed Zebra server so that it doesn't depend on global common_resource.
69 * Revision 1.7 1997/07/15 16:33:07 adam
70 * Check for zero length in execData.
72 * Revision 1.6 1997/02/24 10:41:51 adam
73 * Cleanup of code and commented out the "end element-end-record" code.
75 * Revision 1.5 1997/02/19 16:22:33 adam
76 * Fixed "end element" to terminate record in outer-most level.
78 * Revision 1.4 1997/02/12 20:42:58 adam
79 * Changed some log messages.
81 * Revision 1.3 1996/11/08 14:05:33 adam
82 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
84 * Revision 1.2 1996/10/29 14:02:09 adam
85 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
86 * data1_get_tabpath is used.
88 * Revision 1.1 1996/10/11 10:57:30 adam
89 * New module recctrl. Used to manage records (extract/retrieval).
91 * Revision 1.24 1996/06/17 14:25:31 adam
92 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
94 * Revision 1.23 1996/06/04 10:19:00 adam
95 * Minor changes - removed include of ctype.h.
97 * Revision 1.22 1996/06/03 15:23:13 adam
98 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
100 * Revision 1.21 1996/05/14 16:58:38 adam
103 * Revision 1.20 1996/05/01 13:46:36 adam
104 * First work on multiple records in one file.
105 * New option, -offset, to the "unread" command in the filter module.
107 * Revision 1.19 1996/02/12 16:18:20 adam
108 * Yet another bug fix in implementation of unread command.
110 * Revision 1.18 1996/02/12 16:07:54 adam
111 * Bug fix in new unread command.
113 * Revision 1.17 1996/02/12 15:56:11 adam
114 * New code command: unread.
116 * Revision 1.16 1996/01/17 14:57:51 adam
117 * Prototype changed for reader functions in extract/retrieve. File
118 * is identified by 'void *' instead of 'int'.
120 * Revision 1.15 1996/01/08 19:15:47 adam
121 * New input filter that works!
123 * Revision 1.14 1996/01/08 09:10:38 adam
124 * Yet another complete rework on this module.
126 * Revision 1.13 1995/12/15 17:21:50 adam
127 * This version is able to set data.formatted_text in data1-nodes.
129 * Revision 1.12 1995/12/15 16:20:10 adam
130 * The filter files (*.flt) are read from the path given by data1_tabpath.
132 * Revision 1.11 1995/12/15 12:35:16 adam
135 * Revision 1.10 1995/12/15 10:35:36 adam
138 * Revision 1.9 1995/12/14 16:38:48 adam
139 * Completely new attempt to make regular expression parsing.
141 * Revision 1.8 1995/12/13 17:16:59 adam
144 * Revision 1.7 1995/12/13 16:51:58 adam
145 * Modified to set last_child in data1_nodes.
146 * Uses destroy handler to free up data text nodes.
148 * Revision 1.6 1995/12/13 13:45:37 quinn
149 * Changed data1 to use nmem.
151 * Revision 1.5 1995/12/11 09:12:52 adam
152 * The rec_get function returns NULL if record doesn't exist - will
153 * happen in the server if the result set records have been deleted since
154 * the creation of the set (i.e. the search).
155 * The server saves a result temporarily if it is 'volatile', i.e. the
156 * set is register dependent.
158 * Revision 1.4 1995/12/05 16:57:40 adam
159 * More work on regular patterns.
161 * Revision 1.3 1995/12/05 09:37:09 adam
162 * One malloc was renamed to xmalloc.
164 * Revision 1.2 1995/12/04 17:59:24 adam
165 * More work on regular expression conversion.
167 * Revision 1.1 1995/12/04 14:25:30 adam
168 * Started work on regular expression parsed input to structured records.
177 #include <zebrautl.h>
/* Sentinel file position: f_win_advance, tryMatch, execAction and lexNode
   compare positions against it to recognise end of input. */
187 #define F_WIN_EOF 2000000000
/* Token / action kinds. readOneSpec tests a parsed token against
   REGX_CONTEXT; the member comments below tie REGX_PATTERN and REGX_CODE to
   the corresponding union members. */
191 #define REGX_PATTERN 1
196 #define REGX_CONTEXT 6
/* One element of a rule's action list; elements are chained through `next`.
   NOTE(review): several struct opening/closing lines in this region are not
   visible in this excerpt, so the members that follow belong to more than
   one struct (lexRuleAction, lexRuleInfo, lexRule, lexContext, lexConcatBuf,
   lexSpec, ...). */
203 struct lexRuleAction {
207 struct DFA *dfa; /* REGX_PATTERN */
210 struct regxCode *code; /* REGX_CODE */
212 struct lexRuleAction *next;
217 struct lexRuleAction *actionList;
221 struct lexRuleInfo info;
222 struct lexRule *next;
/* Linked list of rules; readOneSpec pushes new rules at the head. */
228 struct lexRule *rules;
/* Array indexed by rule number, filled in by readFileSpec. */
229 struct lexRuleInfo **fastRule;
/* Action lists run by lexRoot: init and begin before scanning, end after. */
233 struct lexRuleAction *beginActionList;
234 struct lexRuleAction *endActionList;
235 struct lexRuleAction *initActionList;
236 struct lexContext *next;
/* Per-level text accumulation buffer; execData uses its len/max/buf. */
239 struct lexConcatBuf {
/* Head of the list of all contexts known to the specification. */
247 struct lexContext *context;
/* Stack of active contexts; context_stack[context_stack_top] is the one
   used for matching (see lexNode). */
249 struct lexContext **context_stack;
250 int context_stack_size;
251 int context_stack_top;
257 Tcl_Interp *tcl_interp;
/* Optional callback; lexNode calls it with the current position after a
   rule has run and the position is not F_WIN_EOF. */
260 void (*f_win_ef)(void *, off_t);
262 int f_win_start; /* first byte of buffer is this file offset */
263 int f_win_end; /* last byte of buffer is this offset - 1 */
264 int f_win_size; /* size of buffer */
265 char *f_win_buf; /* buffer itself */
/* Read callback: f_win_get uses its return value as a byte count. */
266 int (*f_win_rf)(void *, char *, size_t);
/* Seek callback: f_win_get calls it with the wanted start offset. */
267 off_t (*f_win_sf)(void *, off_t);
/* One lexConcatBuf per nesting level, allocated in lexSpecCreate. */
269 struct lexConcatBuf **concatBuf;
/* Stack of data1 nodes, indexed by d1_level. */
271 data1_node **d1_stack;
282 struct lexSpec *spec;
/*
 * f_win_get: make the bytes for file offsets [start_pos, end_pos) available
 * in the read window, return a pointer to them and store the number of
 * usable bytes through *size (which may be less than requested).
 * NOTE(review): the line declaring the `size` parameter and the block braces
 * are not visible in this excerpt; comments follow the visible statements.
 */
285 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
288 int i, r, off = start_pos - spec->f_win_start;
/* Case 1: request lies entirely inside the current window - no I/O. */
290 if (off >= 0 && end_pos <= spec->f_win_end)
292 *size = end_pos - start_pos;
293 return spec->f_win_buf + off;
/* Case 2: start is outside the window - seek there and refill the buffer
   from its beginning. */
295 if (off < 0 || start_pos >= spec->f_win_end)
297 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
298 spec->f_win_start = start_pos;
/* The window buffer is allocated on first use. */
300 if (!spec->f_win_buf)
301 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
302 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
304 spec->f_win_end = spec->f_win_start + *size;
/* Never report more bytes than the caller asked for. */
306 if (*size > end_pos - start_pos)
307 *size = end_pos - start_pos;
308 return spec->f_win_buf;
/* Case 3: start is inside the window but the request runs past its end -
   move the still-needed tail to the front of the buffer, then read more
   data into the space behind it. */
310 for (i = 0; i<spec->f_win_end - start_pos; i++)
311 spec->f_win_buf[i] = spec->f_win_buf[i + off];
312 r = (*spec->f_win_rf)(spec->f_win_fh,
314 spec->f_win_size - i);
315 spec->f_win_start = start_pos;
316 spec->f_win_end += r;
318 if (*size > end_pos - start_pos)
319 *size = end_pos - start_pos;
320 return spec->f_win_buf;
/*
 * f_win_advance: fetch the byte at file offset *pos.
 * When the offset is inside the current window the byte is returned
 * directly and *pos is incremented. Otherwise, after a test for the
 * F_WIN_EOF sentinel, a one-byte span is requested through f_win_get.
 * NOTE(review): what is returned for F_WIN_EOF and how the f_win_get
 * result is used are on lines not visible in this excerpt.
 */
323 static int f_win_advance (struct lexSpec *spec, int *pos)
328 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
329 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
330 if (*pos == F_WIN_EOF)
332 buf = f_win_get (spec, *pos, *pos+1, &size);
/*
 * regxCodeDel: dispose of the regxCode object referenced through *pp.
 * NOTE(review): only the fetch of *pp is visible here; presumably the
 * object is freed and *pp cleared - confirm against the full source.
 */
342 static void regxCodeDel (struct regxCode **pp)
344 struct regxCode *p = *pp;
/*
 * regxCodeMk: build a regxCode object holding a copy of the len bytes at buf.
 * The string buffer is allocated with room for len+1 bytes.
 * NOTE(review): the terminating NUL store and the assignment to *pp are not
 * visible in this excerpt.
 */
353 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
357 p = (struct regxCode *) xmalloc (sizeof(*p));
358 p->str = (char *) xmalloc (len+1);
359 memcpy (p->str, buf, len);
/*
 * lexSpecDFA: produce a DFA object configured for parsing filter patterns.
 * The visible lines adjust the parser's character map: the entries for
 * space and tab are deleted and an entry for '/' with value 0 is added.
 * NOTE(review): creation and return of `dfa` are on lines not visible here;
 * the exact meaning of the cmap calls should be checked in the dfa module.
 */
364 static struct DFA *lexSpecDFA (void)
369 dfa_parse_cmap_del (dfa, ' ');
370 dfa_parse_cmap_del (dfa, '\t');
371 dfa_parse_cmap_add (dfa, '/', 0);
/*
 * actionListDel: release every action in the list starting at *rap.
 * The loop keeps the successor in ra1 so the current element can be
 * disposed of; pattern actions have their DFA deleted, code actions have
 * their regxCode deleted.
 * NOTE(review): the dispatch on the action kind, the freeing of the element
 * itself and the reset of *rap are not visible in this excerpt.
 */
375 static void actionListDel (struct lexRuleAction **rap)
377 struct lexRuleAction *ra1, *ra;
379 for (ra = *rap; ra; ra = ra1)
385 dfa_delete (&ra->u.pattern.dfa);
388 regxCodeDel (&ra->u.code);
/*
 * lexContextCreate: allocate a new lexing context called `name`.
 * The name is duplicated, a fresh pattern DFA is obtained from lexSpecDFA
 * and the begin/end/init action lists start out empty.
 * NOTE(review): initialisation of the remaining members and the return
 * statement are not visible in this excerpt.
 */
396 static struct lexContext *lexContextCreate (const char *name)
398 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
400 p->name = xstrdup (name);
403 p->dfa = lexSpecDFA ();
406 p->beginActionList = NULL;
407 p->endActionList = NULL;
408 p->initActionList = NULL;
/*
 * lexContextDestroy: release a context created by lexContextCreate.
 * Every rule's action list is deleted, followed by the context's begin and
 * end action lists.
 * NOTE(review): no actionListDel call for initActionList is visible in this
 * excerpt - confirm it is released elsewhere, otherwise it leaks.
 */
413 static void lexContextDestroy (struct lexContext *p)
415 struct lexRule *rp, *rp1;
/* rp1 carries the successor so the current rule can be disposed of. */
418 for (rp = p->rules; rp; rp = rp1)
421 actionListDel (&rp->info.actionList);
424 actionListDel (&p->beginActionList);
425 actionListDel (&p->endActionList);
/*
 * lexSpecCreate: allocate a lexSpec named `name`.
 * Visible work: copy the name, allocate a context stack of 100 entries,
 * allocate one empty concatenation buffer per level (maxLevel of them) and
 * allocate the data1 node stack with maxLevel entries.
 * NOTE(review): the assignment of maxLevel, the use of `dh` and the return
 * statement are on lines not visible in this excerpt.
 */
430 static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
435 p = (struct lexSpec *) xmalloc (sizeof(*p));
436 p->name = (char *) xmalloc (strlen(name)+1);
437 strcpy (p->name, name);
444 p->context_stack_size = 100;
445 p->context_stack = (struct lexContext **)
446 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
450 p->concatBuf = (struct lexConcatBuf **)
451 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
/* Each level starts with no buffer; execData allocates on demand. */
452 for (i = 0; i < p->maxLevel; i++)
454 p->concatBuf[i] = (struct lexConcatBuf *)
455 xmalloc (sizeof(**p->concatBuf));
456 p->concatBuf[i]->len = p->concatBuf[i]->max = 0;
457 p->concatBuf[i]->buf = 0;
459 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
/*
 * lexSpecDestroy: release a lexSpec and the resources hanging off it.
 * Visible work: free each per-level concat descriptor and the array,
 * destroy every context in the list, delete the Tcl interpreter, and free
 * the read window buffer and the context stack.
 * NOTE(review): no xfree of concatBuf[i]->buf (allocated in execData) is
 * visible in this excerpt - confirm it is released, otherwise it leaks.
 * The fetch of *pp, the loop header over contexts and any guard around
 * Tcl_DeleteInterp are also not visible.
 */
464 static void lexSpecDestroy (struct lexSpec **pp)
467 struct lexContext *lt;
475 for (i = 0; i < p->maxLevel; i++)
476 xfree (p->concatBuf[i]);
477 xfree (p->concatBuf);
/* Remember the successor before the current context is destroyed. */
482 struct lexContext *lt_next = lt->next;
483 lexContextDestroy (lt);
488 Tcl_DeleteInterp (p->tcl_interp);
491 xfree (p->f_win_buf);
492 xfree (p->context_stack);
/*
 * readParseToken: read the next token of a filter specification from *cpp.
 * Leading spaces, tabs and newlines are skipped. A command word is
 * collected into `cmd` with upper-case letters folded to lower case and is
 * then matched against "begin", "end", "body", "context" and "init"; an
 * unknown word or an unexpected character is logged as a warning.
 * NOTE(review): the token codes returned, the updates of *cpp / *len and the
 * handling of pattern and code tokens are on lines not visible here.
 */
498 static int readParseToken (const char **cpp, int *len)
500 const char *cp = *cpp;
504 while (*cp == ' ' || *cp == '\t' || *cp == '\n')
533 if (*cp >= 'a' && *cp <= 'z')
535 else if (*cp >= 'A' && *cp <= 'Z')
536 cmd[i] = *cp + 'a' - 'A';
/* Bound check on the index into cmd. */
539 if (i < (int) sizeof(cmd)-2)
546 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
/* Scan forward to the next white space (or end of string). */
548 while (*cp && *cp != ' ' && *cp != '\t' && *cp != '\n')
554 if (!strcmp (cmd, "begin"))
556 else if (!strcmp (cmd, "end"))
558 else if (!strcmp (cmd, "body"))
560 else if (!strcmp (cmd, "context"))
562 else if (!strcmp (cmd, "init"))
566 logf (LOG_WARN, "bad command %s", cmd);
/*
 * actionListMk: parse the action part of a rule from `s` and build the
 * corresponding list of lexRuleAction elements through `ap`.
 * Tokens are read with readParseToken until it returns 0. Code tokens get a
 * regxCode copy of the text; pattern tokens get their own DFA, which is
 * parsed from `s` and then built with dfa_mkstate; BEGIN and INIT tokens
 * are rejected with a warning.
 * NOTE(review): the switch on the token kind, the advancing of `ap`/`s` and
 * the return values are on lines not visible in this excerpt.
 */
572 static int actionListMk (struct lexSpec *spec, const char *s,
573 struct lexRuleAction **ap)
579 while ((tok = readParseToken (&s, &len)))
587 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
589 regxCodeMk (&(*ap)->u.code, s, len);
593 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/* bodyMark records whether this pattern was flagged as a body pattern. */
595 (*ap)->u.pattern.body = bodyMark;
597 (*ap)->u.pattern.dfa = lexSpecDFA ();
599 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
/* s0 marks where the pattern began, so the message shows the parsed text. */
604 logf (LOG_WARN, "regular expression error '%.*s'", s-s0, s0);
607 dfa_mkstate ((*ap)->u.pattern.dfa);
611 logf (LOG_WARN, "cannot use BEGIN here");
614 logf (LOG_WARN, "cannot use INIT here");
617 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
/*
 * readOneSpec: process one logical line `s` of a filter specification.
 * Visible behaviour:
 *  - a CONTEXT token followed by a name creates a new context, linked in
 *    front of the existing context list;
 *  - a "main" context is created via lexContextCreate ("main");
 *  - begin / end / init lines replace the matching action list of the
 *    current context;
 *  - otherwise the line is treated as a rule: its pattern is parsed into
 *    the context DFA, a lexRule with the next rule number is pushed at the
 *    head of the rule list and its actions are parsed.
 * NOTE(review): the conditions selecting these branches and the return
 * values are on lines not visible in this excerpt.
 */
627 int readOneSpec (struct lexSpec *spec, const char *s)
631 struct lexContext *lc;
633 tok = readParseToken (&s, &len);
634 if (tok == REGX_CONTEXT)
636 char context_name[32];
637 tok = readParseToken (&s, &len);
638 if (tok != REGX_CODE)
640 logf (LOG_WARN, "missing name after CONTEXT keyword");
/* NOTE(review): context_name holds 32 bytes; no clamp of len is visible
   before this copy in the excerpt - confirm len is limited to 31. */
645 memcpy (context_name, s, len);
646 context_name[len] = '\0';
647 lc = lexContextCreate (context_name);
648 lc->next = spec->context;
653 spec->context = lexContextCreate ("main");
/* Each of the three special lists is emptied before being rebuilt. */
658 actionListDel (&spec->context->beginActionList);
659 actionListMk (spec, s, &spec->context->beginActionList);
662 actionListDel (&spec->context->endActionList);
663 actionListMk (spec, s, &spec->context->endActionList);
666 actionListDel (&spec->context->initActionList);
667 actionListMk (spec, s, &spec->context->initActionList);
671 logf (LOG_DEBUG, "rule %d %s", spec->context->ruleNo, s);
673 r = dfa_parse (spec->context->dfa, &s);
676 logf (LOG_WARN, "regular expression error. r=%d", r);
681 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* Register the rule: number it and push it at the head of the list. */
685 rp = (struct lexRule *) xmalloc (sizeof(*rp));
686 rp->info.no = spec->context->ruleNo++;
687 rp->next = spec->context->rules;
688 spec->context->rules = rp;
689 actionListMk (spec, s, &rp->info.actionList);
/*
 * readFileSpec: load the filter specification file for `spec`.
 * The file name is "<name>.tflt" when a Tcl interpreter is present and
 * "<name>.flt" otherwise (NOTE(review): whether the second is a fallback or
 * an alternative depends on conditions not visible here); it is opened via
 * yaz_path_fopen using the data1 tab path. Lines are collected into lineBuf
 * and handed to readOneSpec. Afterwards, for every context, the fastRule
 * array (rule number -> rule info) is built and the context DFA is built
 * with dfa_mkstate.
 * NOTE(review): the line-reading loop, the error counting and the return
 * value are only partly visible in this excerpt.
 */
694 int readFileSpec (struct lexSpec *spec)
696 struct lexContext *lc;
699 int c, i, errors = 0;
702 lineBuf = (char *) xmalloc (1+lineSize);
704 if (spec->tcl_interp)
706 sprintf (lineBuf, "%s.tflt", spec->name);
707 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), lineBuf, "r");
712 sprintf (lineBuf, "%s.flt", spec->name);
713 spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh), lineBuf, "r");
717 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
721 logf (LOG_LOG, "reading regx filter %s", lineBuf);
723 if (spec->tcl_interp)
724 logf (LOG_LOG, "Tcl enabled");
/* Lines starting with '#', newline, space or tab get separate handling
   from lines that start a specification. */
731 if (c == '#' || c == '\n' || c == ' ' || c == '\t')
733 while (c != '\n' && c != EOF)
/* A following line that does not start with white space ends the current
   logical line (presumably continuation handling - confirm). */
752 if (c != ' ' && c != '\t')
761 readOneSpec (spec, lineBuf);
762 spec->lineNo += addLine;
771 debug_dfa_followpos = 1;
/* Finalise each context: index rules by number and build its DFA. */
774 for (lc = spec->context; lc; lc = lc->next)
777 lc->fastRule = (struct lexRuleInfo **)
778 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
779 for (i = 0; i < lc->ruleNo; i++)
780 lc->fastRule[i] = NULL;
781 for (rp = lc->rules; rp; rp = rp->next)
782 lc->fastRule[rp->info.no] = &rp->info;
783 dfa_mkstate (lc->dfa);
/* File-scope lexSpec pointer, initially NULL. Its users are not visible in
   this excerpt. */
792 static struct lexSpec *curLexSpec = NULL;
/*
 * execData: add elen bytes of text at ebuf to the record tree at the
 * current nesting level.
 * If the node on top of the stack is already a data node, the new text is
 * appended to it (org_len is its current length); otherwise a new
 * DATA1N_data / DATA1I_text node is created under the parent and linked in.
 * The text itself is accumulated in the per-level concat buffer, which is
 * grown (with 256 bytes of slack) when needed; the node's length is bumped
 * by elen.
 * NOTE(review): several branch/brace/preprocessor lines are missing from
 * this excerpt, so the lines that store directly into res->u.data.data may
 * belong to a disabled or alternative path.
 */
795 static void execData (struct lexSpec *spec,
796 const char *ebuf, int elen, int formatted_text)
798 struct data1_node *res, *parent;
801 if (elen == 0) /* shouldn't happen, but it does! */
/* Debug logging: long data is shown as head ... tail. */
805 logf (LOG_DEBUG, "data (%d bytes) %.15s ... %.*s", elen,
806 ebuf, 15, ebuf + elen-15);
808 logf (LOG_DEBUG, "data (%d bytes) %.*s", elen, elen, ebuf);
810 logf (LOG_DEBUG, "data (%d bytes)", elen);
/* Levels 0 and 1 get special treatment (the action is not visible here). */
813 if (spec->d1_level <= 1)
816 parent = spec->d1_stack[spec->d1_level -1];
/* Existing data node on top of the stack: append to it. */
819 if ((res = spec->d1_stack[spec->d1_level]) && res->which == DATA1N_data)
820 org_len = res->u.data.len;
825 res = data1_mk_node (spec->dh, spec->m);
826 res->parent = parent;
827 res->which = DATA1N_data;
828 res->u.data.what = DATA1I_text;
830 res->u.data.formatted_text = formatted_text;
832 if (elen > DATA1_LOCALDATA)
833 res->u.data.data = nmem_malloc (spec->m, elen);
835 res->u.data.data = res->lbuf;
836 memcpy (res->u.data.data, ebuf, elen);
838 res->u.data.data = 0;
840 res->root = parent->root;
/* Link the new node as the parent's last child and as the sibling of the
   previous node at this level. */
842 parent->last_child = res;
843 if (spec->d1_stack[spec->d1_level])
844 spec->d1_stack[spec->d1_level]->next = res;
847 spec->d1_stack[spec->d1_level] = res;
/* Grow the concat buffer when the combined text would not fit. */
849 if (org_len + elen >= spec->concatBuf[spec->d1_level]->max)
851 char *old_buf, *new_buf;
853 spec->concatBuf[spec->d1_level]->max = org_len + elen + 256;
854 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level]->max);
855 if ((old_buf = spec->concatBuf[spec->d1_level]->buf))
857 memcpy (new_buf, old_buf, org_len);
860 spec->concatBuf[spec->d1_level]->buf = new_buf;
862 assert (spec->concatBuf[spec->d1_level]);
863 memcpy (spec->concatBuf[spec->d1_level]->buf + org_len, ebuf, elen);
864 res->u.data.len += elen;
/*
 * execDataP: forward a span of text to execData with the same arguments.
 * NOTE(review): any pre-processing of the span before the call is not
 * visible in this excerpt.
 */
867 static void execDataP (struct lexSpec *spec,
868 const char *ebuf, int elen, int formatted_text)
870 execData (spec, ebuf, elen, formatted_text);
/*
 * tagDataRelease: finalise the text data node on top of the stack, if any.
 * For a DATA1N_data / DATA1I_text node whose data pointer is still unset,
 * storage is chosen (nmem allocation when the length exceeds
 * DATA1_LOCALDATA, the node's local buffer otherwise) and the accumulated
 * text is copied in from the concat buffer of the current level.
 * NOTE(review): the length argument of the final memcpy is on a line not
 * visible in this excerpt.
 */
873 static void tagDataRelease (struct lexSpec *spec)
877 if ((res = spec->d1_stack[spec->d1_level]) &&
878 res->which == DATA1N_data &&
879 res->u.data.what == DATA1I_text)
/* The node must not have been released before and must hold some text. */
881 assert (!res->u.data.data);
882 assert (res->u.data.len > 0);
883 if (res->u.data.len > DATA1_LOCALDATA)
884 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
886 res->u.data.data = res->lbuf;
887 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level]->buf,
/*
 * variantBegin: open a variant node (class / type / value) at the current
 * level of the record tree.
 * The class and type strings are copied into local buffers, truncated to
 * DATA1_MAX_SYMBOL-1 characters, and looked up with data1_getvartypebyct in
 * the variant set of the record's abstract syntax. If the parent is not
 * itself a variant node, an empty variant node is inserted first. Then a
 * variant node carrying the type and a copy of the value (truncated to
 * DATA1_LOCALDATA-1 characters, stored in the node's local buffer) is
 * linked in and the level is increased.
 * NOTE(review): `parent` is computed from d1_stack[d1_level - 1] in its
 * initialiser, i.e. before the d1_level == 0 check below - at level 0 this
 * reads index -1; confirm whether that is harmless. The handling of a
 * failed type lookup and the body of the search loop are not visible here.
 */
892 static void variantBegin (struct lexSpec *spec,
893 const char *class_str, int class_len,
894 const char *type_str, int type_len,
895 const char *value_str, int value_len)
897 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
898 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
903 if (spec->d1_level == 0)
905 logf (LOG_WARN, "in variant begin. No record type defined");
/* Bounded, NUL-terminated copies of class and type. */
908 if (class_len >= DATA1_MAX_SYMBOL)
909 class_len = DATA1_MAX_SYMBOL-1;
910 memcpy (tclass, class_str, class_len);
911 tclass[class_len] = '\0';
913 if (type_len >= DATA1_MAX_SYMBOL)
914 type_len = DATA1_MAX_SYMBOL-1;
915 memcpy (ttype, type_str, type_len);
916 ttype[type_len] = '\0';
919 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype,
924 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* Parent is not a variant: insert an empty variant node (type and value
   unset) and descend one level. */
928 if (parent->which != DATA1N_variant)
930 res = data1_mk_node (spec->dh, spec->m);
931 res->parent = parent;
932 res->which = DATA1N_variant;
933 res->u.variant.type = 0;
934 res->u.variant.value = 0;
935 res->root = parent->root;
937 parent->last_child = res;
938 if (spec->d1_stack[spec->d1_level])
940 tagDataRelease (spec);
941 spec->d1_stack[spec->d1_level]->next = res;
945 spec->d1_stack[spec->d1_level] = res;
946 spec->d1_stack[++(spec->d1_level)] = NULL;
/* Walk up through enclosing variant nodes looking for one of the same
   type. */
948 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
949 if (spec->d1_stack[i]->u.variant.type == tp)
956 logf (LOG_DEBUG, "variant node (%d)", spec->d1_level);
/* Create the variant node that carries the type and value. */
958 parent = spec->d1_stack[spec->d1_level-1];
959 res = data1_mk_node (spec->dh, spec->m);
960 res->parent = parent;
961 res->which = DATA1N_variant;
962 res->root = parent->root;
963 res->u.variant.type = tp;
965 if (value_len >= DATA1_LOCALDATA)
966 value_len =DATA1_LOCALDATA-1;
967 memcpy (res->lbuf, value_str, value_len);
968 res->lbuf[value_len] = '\0';
970 res->u.variant.value = res->lbuf;
972 parent->last_child = res;
973 if (spec->d1_stack[spec->d1_level])
975 tagDataRelease (spec);
976 spec->d1_stack[spec->d1_level]->next = res;
980 spec->d1_stack[spec->d1_level] = res;
981 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagStrip: trim white space around a tag given as pointer + length.
 * The first loop scans backwards over trailing white space, the second
 * scans forwards over leading white space.
 * NOTE(review): the statements that store the trimmed pointer and length
 * back through `tag` / `len` are not visible in this excerpt.
 */
984 static void tagStrip (const char **tag, int *len)
988 for (i = *len; i > 0 && isspace((*tag)[i-1]); --i)
991 for (i = 0; i < *len && isspace((*tag)[i]); i++)
/*
 * tagBegin: open an element (tag) node named by tag[0..len) at the current
 * level of the record tree.
 * The tag name is trimmed with tagStrip and copied into the node (local
 * buffer when it fits within DATA1_LOCALDATA, nmem allocation otherwise).
 * The element definition is looked up by tag name in the record's abstract
 * syntax; the node is then linked under its parent and the level is
 * increased.
 * NOTE(review): `parent` and `partag` are computed from
 * d1_stack[d1_level - 1] in their initialisers, i.e. before the
 * d1_level == 0 check below - at level 0 this reads index -1; confirm
 * whether that is harmless. Some arguments and branch lines are not visible
 * in this excerpt.
 */
997 static void tagBegin (struct lexSpec *spec,
998 const char *tag, int len)
1000 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
1001 data1_element *elem = NULL;
1002 data1_node *partag = get_parent_tag(spec->dh, parent);
1004 data1_element *e = NULL;
1007 if (spec->d1_level == 0)
1009 logf (LOG_WARN, "in element begin. No record type defined");
1012 tagStrip (&tag, &len);
1014 res = data1_mk_node (spec->dh, spec->m);
1015 res->parent = parent;
1016 res->which = DATA1N_tag;
1017 res->u.tag.get_bytes = -1;
/* Choose storage for the NUL-terminated tag name. */
1019 if (len >= DATA1_LOCALDATA)
1020 res->u.tag.tag = (char *) nmem_malloc (spec->m, len+1);
1022 res->u.tag.tag = res->lbuf;
1024 memcpy (res->u.tag.tag, tag, len);
1025 res->u.tag.tag[len] = '\0';
1028 logf (LOG_DEBUG, "begin tag %s (%d)", res->u.tag.tag, spec->d1_level);
1030 if (parent->which == DATA1N_variant)
1033 if (!(e = partag->u.tag.element))
1036 elem = data1_getelementbytagname (spec->dh,
1037 spec->d1_stack[0]->u.root.absyn,
1039 res->u.tag.element = elem;
1040 res->u.tag.node_selected = 0;
1041 res->u.tag.make_variantlist = 0;
1042 res->u.tag.no_data_requested = 0;
1043 res->root = parent->root;
/* Link the node in; pending text at this level is finalised first. */
1045 parent->last_child = res;
1046 if (spec->d1_stack[spec->d1_level])
1048 tagDataRelease (spec);
1049 spec->d1_stack[spec->d1_level]->next = res;
1052 parent->child = res;
1053 spec->d1_stack[spec->d1_level] = res;
1054 spec->d1_stack[++(spec->d1_level)] = NULL;
/*
 * tagEnd: close element levels while the level is above min_level.
 * The tag name is trimmed first. In each round pending text is finalised
 * with tagDataRelease; the node at the current level is compared against
 * the given tag (same length and same bytes).
 * NOTE(review): the level decrement, the handling of a missing tag name and
 * the loop exit are on lines not visible in this excerpt.
 */
1057 static void tagEnd (struct lexSpec *spec, int min_level,
1058 const char *tag, int len)
1060 tagStrip (&tag, &len);
1061 while (spec->d1_level > min_level)
1063 tagDataRelease (spec);
1065 if (spec->d1_level == 0)
1067 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
1069 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
1071 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
1075 logf (LOG_DEBUG, "end tag (%d)", spec->d1_level);
/*
 * tryMatch: run `dfa` over the input starting at *pptr, looking for a
 * match. On a match *mptr receives the start position and *pptr the
 * position just past the end of the match.
 * The DFA is driven one character at a time via f_win_advance; last_rule /
 * last_ptr remember the most recent accepting state seen.
 * NOTE(review): the remaining parameter, the loop structure, the restart
 * logic around `state = dfa->states[0]` and the return values are on lines
 * not visible in this excerpt.
 */
1080 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
1083 struct DFA_state *state = dfa->states[0];
1086 unsigned char c_prev = 0;
1087 int ptr = *pptr; /* current pointer */
1088 int start_ptr = *pptr; /* first char of match */
1089 int last_ptr = 0; /* last char of match */
1090 int last_rule = 0; /* rule number of current match */
1095 c = f_win_advance (spec, &ptr);
1096 if (ptr == F_WIN_EOF)
1113 *mptr = start_ptr; /* match starts here */
1114 *pptr = last_ptr; /* match end here (+1) */
1117 state = dfa->states[0];
/* Transition whose character range [ch[0], ch[1]] covers c. */
1122 else if (c >= t->ch[0] && c <= t->ch[1])
1124 state = dfa->states[t->to];
1129 last_rule = state->rule_no;
1134 last_rule = state->rule_nno;
/*
 * execTok: read the next token of an action code string from *src and
 * report it through *tokBuf / *tokLen.
 * Leading spaces and tabs are skipped. Recognised forms in the visible
 * lines: "$<digits>" (a reference to matched argument n, fetched from the
 * read window with f_win_get), a double-quoted string, a newline or ';'
 * separator, and a bare word ending at white space or ';'.
 * NOTE(review): the return codes for each form and the update of *src are
 * on lines not visible in this excerpt (callers compare the result with 3).
 */
1146 static int execTok (struct lexSpec *spec, const char **src,
1147 const char **tokBuf, int *tokLen)
1149 const char *s = *src;
1151 while (*s == ' ' || *s == '\t')
1155 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1159 while (*s >= '0' && *s <= '9')
1160 n = n*10 + (*s++ -'0');
1161 if (spec->arg_no == 0)
/* Argument numbers at or beyond arg_no get special handling. */
1168 if (n >= spec->arg_no)
1170 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1174 else if (*s == '\"')
1177 while (*s && *s != '\"')
1179 *tokLen = s - *tokBuf;
1184 else if (*s == '\n' || *s == ';')
1192 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1194 *tokLen = s - *tokBuf;
1201 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
1203 *tokLen = s - *tokBuf;
/*
 * regxStrz: copy len bytes from src into the caller-supplied buffer str.
 * NOTE(review): any length clamp, the terminating NUL and the return value
 * are on lines not visible in this excerpt; callers use the result as a C
 * string.
 */
1209 static char *regxStrz (const char *src, int len, char *str)
1213 memcpy (str, src, len);
/*
 * cmd_tcl_begin: Tcl command handler for "begin"; clientData is the lexSpec.
 * Sub-commands seen in the visible lines:
 *   record <absyn>               - look up the abstract syntax and create
 *                                  the root node of a new record
 *   element <tag>                - open an element via tagBegin
 *   variant <class> <type> <val> - open a variant via variantBegin
 *   context <name>               - push the named context on the stack
 * NOTE(review): argv[1] is compared before argc is tested in each branch;
 * confirm argc >= 2 is checked on a line not visible here. No bound check
 * against context_stack_size is visible for the push. The return values
 * are not visible in this excerpt.
 */
1219 static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1220 int argc, char **argv)
1222 struct lexSpec *spec = (struct lexSpec *) clientData;
1225 if (!strcmp(argv[1], "record") && argc == 3)
1227 char *absynName = argv[2];
1231 logf (LOG_DEBUG, "begin record %s", absynName);
1233 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1234 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1239 res = data1_mk_node (spec->dh, spec->m);
1240 res->which = DATA1N_root;
1241 res->u.root.type = absynName;
1242 res->u.root.absyn = absyn;
/* The root occupies the current level; the next level starts empty. */
1245 spec->d1_stack[spec->d1_level] = res;
1246 spec->d1_stack[++(spec->d1_level)] = NULL;
1249 else if (!strcmp(argv[1], "element") && argc == 3)
1251 tagBegin (spec, argv[2], strlen(argv[2]));
1253 else if (!strcmp (argv[1], "variant") && argc == 5)
1255 variantBegin (spec, argv[2], strlen(argv[2]),
1256 argv[3], strlen(argv[3]),
1257 argv[4], strlen(argv[4]));
1259 else if (!strcmp (argv[1], "context") && argc == 3)
1261 struct lexContext *lc = spec->context;
1263 logf (LOG_DEBUG, "begin context %s",argv[2]);
/* Linear search of the context list by name. */
1265 while (lc && strcmp (argv[2], lc->name))
1269 spec->context_stack[++(spec->context_stack_top)] = lc;
1272 logf (LOG_WARN, "unknown context %s", argv[2]);
/*
 * cmd_tcl_end: Tcl command handler for "end"; clientData is the lexSpec.
 * Sub-commands seen in the visible lines:
 *   record            - finalise pending text at every level and set
 *                       stop_flag
 *   element [-record] - close element(s) via tagEnd; when the level drops
 *                       to 0 the record is ended (stop_flag set)
 *   context           - pop the context stack unless already at the bottom
 * NOTE(review): the level decrement in the record loop, the derivation of
 * min_level / element and the return values are not visible in this
 * excerpt.
 */
1279 static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1280 int argc, char **argv)
1282 struct lexSpec *spec = (struct lexSpec *) clientData;
1286 if (!strcmp (argv[1], "record"))
1288 while (spec->d1_level)
1290 tagDataRelease (spec);
1294 logf (LOG_DEBUG, "end record");
1296 spec->stop_flag = 1;
1298 else if (!strcmp (argv[1], "element"))
1302 if (argc >= 3 && !strcmp(argv[2], "-record"))
/* element may be NULL here, hence the guarded strlen. */
1311 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1312 if (spec->d1_level == 0)
1315 logf (LOG_DEBUG, "end element end records");
1317 spec->stop_flag = 1;
1320 else if (!strcmp (argv[1], "context"))
1323 logf (LOG_DEBUG, "end context");
1325 if (spec->context_stack_top)
1326 (spec->context_stack_top)--;
/*
 * cmd_tcl_data: Tcl command handler for "data"; clientData is the lexSpec.
 * Options "-text" and "-element <name>" are recognised. With an element
 * name the data is wrapped: tagBegin, then execData for the data argument,
 * then tagEnd back down to level 1.
 * NOTE(review): the option loop, the conditions around tagBegin / tagEnd
 * and the return value are on lines not visible in this excerpt.
 */
1333 static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1334 int argc, char **argv)
1338 const char *element = 0;
1339 struct lexSpec *spec = (struct lexSpec *) clientData;
1343 if (!strcmp("-text", argv[argi]))
1348 else if (!strcmp("-element", argv[argi]))
1352 element = argv[argi++];
1358 tagBegin (spec, element, strlen(element));
1362 execData (spec, argv[argi], strlen(argv[argi]), textFlag);
1366 tagEnd (spec, 1, NULL, 0);
/*
 * cmd_tcl_unread: Tcl command handler for "unread"; clientData is the
 * lexSpec. Moves the scan position back to the start of matched argument
 * `no`, plus an optional offset given with "-offset <n>".
 * An argument index at or beyond arg_no is clamped to the last argument.
 * NOTE(review): argument-count checks and the return value are on lines
 * not visible in this excerpt; atoi gives no error indication for
 * non-numeric input.
 */
1370 static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1371 int argc, char **argv)
1373 struct lexSpec *spec = (struct lexSpec *) clientData;
1380 if (!strcmp("-offset", argv[argi]))
1385 offset = atoi(argv[argi]);
1394 no = atoi(argv[argi]);
1395 if (no >= spec->arg_no)
1396 no = spec->arg_no - 1;
1397 spec->ptr = spec->arg_start[no] + offset;
/*
 * execTcl: run an action's code as a Tcl script.
 * Before evaluation each matched argument i is exported to the interpreter
 * as a variable named by its decimal index ("0", "1", ...), with the
 * matched text as its value.
 * NOTE(review): the remaining f_win_get arguments, any guard around the
 * Tcl_SetVar sequence and the handling of the Tcl_Eval result are on lines
 * not visible in this excerpt.
 */
1401 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1404 for (i = 0; i < spec->arg_no; i++)
1406 char var_name[10], *var_buf;
1409 sprintf (var_name, "%d", i);
1410 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
/* Temporarily NUL-terminate the span in place so it can be passed as a C
   string, then restore the overwritten byte. */
1414 ch = var_buf[var_len];
1415 var_buf[var_len] = '\0';
1416 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1417 var_buf[var_len] = ch;
1420 Tcl_Eval (spec->tcl_interp, code->str);
/*
 * execCode: interpret an action's code string with the built-in command
 * language (the non-Tcl path).
 * Tokens are pulled with execTok; command words are copied to a C string
 * with regxStrz and dispatched:
 *   begin record|element|variant|context ...
 *   end   record|element [-record] [tag]|context
 *   data  [-text] [-element <name>] <item>...
 *   unread [-offset <n>] <index>
 *   context <name>            (replaces the current context)
 * Unknown commands and stray tokens are logged as warnings.
 * NOTE(review): the outer loop, the checks on execTok's return code `r`
 * and many braces are on lines not visible in this excerpt.
 */
1425 static void execCode (struct lexSpec *spec, struct regxCode *code)
1427 const char *s = code->str;
1429 const char *cmd_str;
1431 r = execTok (spec, &s, &cmd_str, &cmd_len);
1438 r = execTok (spec, &s, &cmd_str, &cmd_len);
1441 p = regxStrz (cmd_str, cmd_len, ptmp);
/* ---- begin ... ---- */
1442 if (!strcmp (p, "begin"))
1444 r = execTok (spec, &s, &cmd_str, &cmd_len);
1447 logf (LOG_WARN, "missing keyword after 'begin'");
1450 p = regxStrz (cmd_str, cmd_len, ptmp);
1451 if (!strcmp (p, "record"))
1453 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* A record root is only created when no record is open yet. */
1456 if (spec->d1_level == 0)
/* Static storage: the root node keeps a pointer to this name. */
1458 static char absynName[64];
1463 memcpy (absynName, cmd_str, cmd_len);
1464 absynName[cmd_len] = '\0';
1467 logf (LOG_DEBUG, "begin record %s", absynName);
1469 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1470 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1475 res = data1_mk_node (spec->dh, spec->m);
1476 res->which = DATA1N_root;
1477 res->u.root.type = absynName;
1478 res->u.root.absyn = absyn;
1481 spec->d1_stack[spec->d1_level] = res;
1482 spec->d1_stack[++(spec->d1_level)] = NULL;
1485 r = execTok (spec, &s, &cmd_str, &cmd_len);
1487 else if (!strcmp (p, "element"))
1489 r = execTok (spec, &s, &cmd_str, &cmd_len);
1492 tagBegin (spec, cmd_str, cmd_len);
1493 r = execTok (spec, &s, &cmd_str, &cmd_len);
1495 else if (!strcmp (p, "variant"))
1498 const char *class_str = NULL;
1500 const char *type_str = NULL;
1502 const char *value_str = NULL;
/* Three tokens in order: class, type, value. */
1503 r = execTok (spec, &s, &cmd_str, &cmd_len);
1506 class_str = cmd_str;
1507 class_len = cmd_len;
1508 r = execTok (spec, &s, &cmd_str, &cmd_len);
1514 r = execTok (spec, &s, &cmd_str, &cmd_len);
1517 value_str = cmd_str;
1518 value_len = cmd_len;
1520 variantBegin (spec, class_str, class_len,
1521 type_str, type_len, value_str, value_len);
1524 r = execTok (spec, &s, &cmd_str, &cmd_len);
1526 else if (!strcmp (p, "context"))
1530 struct lexContext *lc = spec->context;
1531 r = execTok (spec, &s, &cmd_str, &cmd_len);
1532 p = regxStrz (cmd_str, cmd_len, ptmp);
1534 logf (LOG_DEBUG, "begin context %s", p);
/* Find the context by name and push it.
   NOTE(review): no bound check against context_stack_size is visible. */
1536 while (lc && strcmp (p, lc->name))
1539 spec->context_stack[++(spec->context_stack_top)] = lc;
1541 logf (LOG_WARN, "unknown context %s", p);
1544 r = execTok (spec, &s, &cmd_str, &cmd_len);
1548 logf (LOG_WARN, "bad keyword '%s' after begin", p);
/* ---- end ... ---- */
1551 else if (!strcmp (p, "end"))
1553 r = execTok (spec, &s, &cmd_str, &cmd_len);
1556 logf (LOG_WARN, "missing keyword after 'end'");
1559 p = regxStrz (cmd_str, cmd_len, ptmp);
1560 if (!strcmp (p, "record"))
/* Finalise pending text at every open level, then stop scanning. */
1562 while (spec->d1_level)
1564 tagDataRelease (spec);
1567 r = execTok (spec, &s, &cmd_str, &cmd_len);
1569 logf (LOG_DEBUG, "end record");
1571 spec->stop_flag = 1;
1573 else if (!strcmp (p, "element"))
/* Consume options (execTok result 3) before the optional tag name. */
1576 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1578 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1583 tagEnd (spec, min_level, cmd_str, cmd_len);
1584 r = execTok (spec, &s, &cmd_str, &cmd_len);
1587 tagEnd (spec, min_level, NULL, 0);
/* Closing the outermost level ends the record. */
1588 if (spec->d1_level == 0)
1591 logf (LOG_DEBUG, "end element end records");
1593 spec->stop_flag = 1;
1597 else if (!strcmp (p, "context"))
1600 logf (LOG_DEBUG, "end context");
/* Pop, but never below the bottom entry. */
1602 if (spec->context_stack_top)
1603 (spec->context_stack_top)--;
1604 r = execTok (spec, &s, &cmd_str, &cmd_len);
1607 logf (LOG_WARN, "bad keyword '%s' after end", p);
/* ---- data ... ---- */
1609 else if (!strcmp (p, "data"))
1613 const char *element_str = NULL;
1615 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1617 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1619 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1621 r = execTok (spec, &s, &element_str, &element_len);
1626 logf (LOG_WARN, "bad data option: %.*s",
1631 logf (LOG_WARN, "missing data item after data");
/* With -element the data is wrapped in a begin/end element pair. */
1635 tagBegin (spec, element_str, element_len);
1638 execData (spec, cmd_str, cmd_len,textFlag);
1639 r = execTok (spec, &s, &cmd_str, &cmd_len);
1642 tagEnd (spec, 1, NULL, 0);
/* ---- unread ... ---- */
1644 else if (!strcmp (p, "unread"))
1647 r = execTok (spec, &s, &cmd_str, &cmd_len);
1648 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1650 r = execTok (spec, &s, &cmd_str, &cmd_len);
1653 logf (LOG_WARN, "missing number after -offset");
1656 p = regxStrz (cmd_str, cmd_len, ptmp);
1658 r = execTok (spec, &s, &cmd_str, &cmd_len);
1664 logf (LOG_WARN, "missing index after unread command");
/* The index must be a single decimal digit. */
1667 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1669 logf (LOG_WARN, "bad index after unread command");
/* Clamp to the last matched argument, then rewind the scan position. */
1674 no = *cmd_str - '0';
1675 if (no >= spec->arg_no)
1676 no = spec->arg_no - 1;
1677 spec->ptr = spec->arg_start[no] + offset;
1679 r = execTok (spec, &s, &cmd_str, &cmd_len);
/* ---- context <name>: switch the current context in place ---- */
1681 else if (!strcmp (p, "context"))
1685 struct lexContext *lc = spec->context;
1686 r = execTok (spec, &s, &cmd_str, &cmd_len);
1687 p = regxStrz (cmd_str, cmd_len, ptmp);
1689 while (lc && strcmp (p, lc->name))
/* Unlike "begin context", the top entry is overwritten, not pushed. */
1692 spec->context_stack[spec->context_stack_top] = lc;
1694 logf (LOG_WARN, "unknown context %s", p);
1697 r = execTok (spec, &s, &cmd_str, &cmd_len);
1701 logf (LOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1702 r = execTok (spec, &s, &cmd_str, &cmd_len);
1707 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1709 r = execTok (spec, &s, &cmd_str, &cmd_len);
/*
 * execAction: run the action list `ap` for a rule whose match began at
 * start_ptr; *pptr is the current scan position and may be advanced.
 * Pattern actions are matched with tryMatch and record the span of each
 * matched argument in arg_start[] / arg_end[] (argument 0 starts at
 * start_ptr). Code actions are executed with the arguments collected so
 * far, through execTcl when a Tcl interpreter exists and through execCode
 * otherwise. stop_flag is tested after code has run.
 * NOTE(review): the loop over the list, the switch on the action kind, the
 * preprocessor guards around the Tcl path and the return values are on
 * lines not visible in this excerpt.
 */
1716 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1717 int start_ptr, int *pptr)
1726 arg_start[0] = start_ptr;
/* Publish the local argument arrays so the code executors can see them. */
1728 spec->arg_start = arg_start;
1729 spec->arg_end = arg_end;
/* Body pattern: the text skipped before the match is itself an argument. */
1736 if (ap->u.pattern.body)
1738 arg_start[arg_no] = *pptr;
1739 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
/* No match: the body extends to end of input and the pattern argument is
   empty at EOF. */
1741 arg_end[arg_no] = F_WIN_EOF;
1743 arg_start[arg_no] = F_WIN_EOF;
1744 arg_end[arg_no] = F_WIN_EOF;
/* Match: the body ends where the match starts; the match is the next
   argument. */
1749 arg_end[arg_no] = sptr;
1751 arg_start[arg_no] = sptr;
1752 arg_end[arg_no] = *pptr;
/* Plain pattern: checked against where the match started. */
1757 arg_start[arg_no] = *pptr;
1758 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1760 if (sptr != arg_start[arg_no])
1762 arg_end[arg_no] = *pptr;
1767 spec->arg_no = arg_no;
1770 if (spec->tcl_interp)
1771 execTcl(spec, ap->u.code);
1773 execCode (spec, ap->u.code);
1775 execCode (spec, ap->u.code);
1778 if (spec->stop_flag)
1782 arg_start[arg_no] = *pptr;
1783 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: run the actions of rule number ruleNo of `context`, looked up
 * through the fastRule table, and return execAction's result.
 * NOTE(review): the trailing arguments of the execAction call are on a line
 * not visible in this excerpt.
 */
1792 static int execRule (struct lexSpec *spec, struct lexContext *context,
1793 int ruleNo, int start_ptr, int *pptr)
1796 logf (LOG_DEBUG, "exec rule %d", ruleNo);
1798 return execAction (spec, context->fastRule[ruleNo]->actionList,
/*
 * lexNode: main scanning loop. Drives the DFA of the context on top of the
 * context stack over the input starting at *ptr.
 * Characters are fetched with f_win_advance. When a rule has matched, any
 * unmatched text between skip_ptr and the match start is emitted as data
 * via execDataP and the rule is executed with execRule; the active context
 * is re-read afterwards because rule code may have changed the stack.
 * Unmatched text pending at end of file is emitted as data as well.
 * NOTE(review): the loop structure, most condition lines and the return
 * value are not visible in this excerpt; comments describe visible
 * statements only.
 */
1802 data1_node *lexNode (struct lexSpec *spec, int *ptr)
1804 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1805 struct DFA_state *state = context->dfa->states[0];
/* Scanning starts as if the previous character had been a newline. */
1808 unsigned char c_prev = '\n';
1810 int last_rule = 0; /* rule number of current match */
1811 int last_ptr = *ptr; /* last char of match */
1812 int start_ptr = *ptr; /* first char of match */
1813 int skip_ptr = *ptr; /* first char of run */
1817 c = f_win_advance (spec, ptr);
1818 if (*ptr == F_WIN_EOF)
1820 /* end of file met */
1823 /* there was a match */
1824 if (skip_ptr < start_ptr)
1826 /* deal with chars that didn't match */
1829 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1830 execDataP (spec, buf, size, 0);
1832 /* restore pointer */
1835 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1837 /* restore skip pointer */
1841 else if (skip_ptr < *ptr)
1843 /* deal with chars that didn't match */
1846 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1847 execDataP (spec, buf, size, 0);
1849 if (*ptr == F_WIN_EOF)
1856 { /* no transition for character c ... */
1859 if (skip_ptr < start_ptr)
1861 /* deal with chars that didn't match */
1864 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1865 execDataP (spec, buf, size, 0);
1867 /* restore pointer */
1869 if (!execRule (spec, context, last_rule, start_ptr, ptr))
/* Notify the optional end callback of the position reached. */
1871 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1874 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1876 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
/* Rule code may have pushed, popped or switched contexts. */
1880 context = spec->context_stack[spec->context_stack_top];
1883 last_ptr = start_ptr = *ptr;
1887 c_prev = f_win_advance (spec, &start_ptr);
1892 c_prev = f_win_advance (spec, &start_ptr);
/* Restart matching from the DFA's initial state. */
1895 state = context->dfa->states[0];
1898 else if (c >= t->ch[0] && c <= t->ch[1])
1899 { /* transition ... */
1900 state = context->dfa->states[t->to];
1905 last_rule = state->rule_no;
1908 else if (state->rule_nno)
1910 last_rule = state->rule_nno;
/*
 * lexRoot: top-level driver for lexing one input.
 * Clears the stop flag and the context stack, selects the context whose name
 * equals context_name (a warning is logged when it cannot be found), runs
 * that context's init and begin action lists, scans with lexNode, releases
 * tags while d1_level is non-zero, runs the end action list and returns
 * spec->d1_stack[0].
 * NOTE(review): how offset is used, and the conditions guarding the action
 * list calls, are not visible in this listing.
 */
1922 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1923 const char *context_name)
/* head of the contexts to search */
1925 struct lexContext *lt = spec->context;
1928 spec->stop_flag = 0;
1930 spec->context_stack_top = 0;
/* compare the candidate context's name with the requested one */
1933 if (!strcmp (lt->name, context_name))
1939 logf (LOG_WARN, "cannot find context %s", context_name);
/* install the selected context and clear the current data1 stack slot */
1942 spec->context_stack[spec->context_stack_top] = lt;
1943 spec->d1_stack[spec->d1_level] = NULL;
1948 execAction (spec, lt->initActionList, ptr, &ptr);
1951 execAction (spec, lt->beginActionList, ptr, &ptr);
1952 lexNode (spec, &ptr);
/* release tags until d1_level reaches zero */
1953 while (spec->d1_level)
1955 tagDataRelease (spec);
1958 execAction (spec, lt->endActionList, ptr, &ptr);
1959 return spec->d1_stack[0];
/*
 * grs_destroy: tear down filter state. clientData is treated as a
 * struct lexSpecs pointer and the spec it holds is destroyed.
 * NOTE(review): whether the lexSpecs container itself is freed is not
 * visible in this listing.
 */
1962 void grs_destroy(void *clientData)
1964 struct lexSpecs *specs = (struct lexSpecs *) clientData;
1967 lexSpecDestroy(&specs->spec);
/*
 * grs_init: allocate the struct lexSpecs container with xmalloc.
 * NOTE(review): initialisation of its members and the return statement are
 * not visible here; presumably the container is returned for later use as
 * clientData - confirm against the complete source.
 */
1972 void *grs_init(void)
1974 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
/*
 * grs_read_regx: read entry point of the regx filter.
 * The lexSpec kept in p->clientData is reused when its name equals p->type;
 * otherwise it is destroyed, re-created and loaded with readFileSpec. The
 * input window fields are then reset from p and lexing starts in the "main"
 * context at p->offset.
 * NOTE(review): the check on res and the failure return are not visible in
 * this listing.
 */
1979 data1_node *grs_read_regx (struct grs_read_info *p)
1982 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
1983 struct lexSpec **curLexSpec = &specs->spec;
1986 logf (LOG_DEBUG, "grs_read_regx");
/* no spec yet, or the spec was built for a different type name */
1988 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
1991 lexSpecDestroy (curLexSpec);
1992 *curLexSpec = lexSpecCreate (p->type, p->dh);
1993 res = readFileSpec (*curLexSpec);
/* presumably the readFileSpec failure path: discard the new spec */
1996 lexSpecDestroy (curLexSpec);
2000 (*curLexSpec)->dh = p->dh;
/* reset the input window and bind the I/O callbacks and handle from p */
2003 (*curLexSpec)->f_win_start = 0;
2004 (*curLexSpec)->f_win_end = 0;
2005 (*curLexSpec)->f_win_rf = p->readf;
2006 (*curLexSpec)->f_win_sf = p->seekf;
2007 (*curLexSpec)->f_win_fh = p->fh;
2008 (*curLexSpec)->f_win_ef = p->endf;
/* fixed window size; units are not visible here */
2009 (*curLexSpec)->f_win_size = 500000;
2011 (*curLexSpec)->m = p->mem;
2012 return lexRoot (*curLexSpec, p->offset, "main");
/*
 * Record-type descriptor for the regx filter, and the exported handle that
 * points at it. The handle must take the address of regx_type, matching how
 * recTypeGrs_tcl takes the address of tcl_type.
 * NOTE(review): the initializer members of regx_type are not visible in this
 * listing.
 */
2015 static struct recTypeGrs regx_type = {
2022 RecTypeGrs recTypeGrs_regx = &regx_type;
/*
 * grs_read_tcl: read entry point of the Tcl filter.
 * Works like the regx reader, but a freshly created lexSpec also gets a Tcl
 * interpreter with the commands "begin", "end", "data" and "unread"
 * registered before the spec file is read. The input window fields are then
 * reset from p and lexing starts in the "main" context at p->offset.
 * NOTE(review): the check on res and the failure return are not visible in
 * this listing.
 */
2025 data1_node *grs_read_tcl (struct grs_read_info *p)
2028 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
2029 struct lexSpec **curLexSpec = &specs->spec;
2032 logf (LOG_DEBUG, "grs_read_tcl");
/* no spec yet, or the spec was built for a different type name */
2034 if (!*curLexSpec || strcmp ((*curLexSpec)->name, p->type))
2036 Tcl_Interp *tcl_interp;
2038 lexSpecDestroy (curLexSpec);
2039 *curLexSpec = lexSpecCreate (p->type, p->dh);
/* create the interpreter and keep it on the spec */
2040 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
/* register the filter commands; the spec is passed as their client data */
2041 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
2042 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
2043 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
2044 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
2046 res = readFileSpec (*curLexSpec);
/* presumably the readFileSpec failure path: discard the new spec */
2049 lexSpecDestroy (curLexSpec);
2053 (*curLexSpec)->dh = p->dh;
/* reset the input window and bind the I/O callbacks and handle from p */
2056 (*curLexSpec)->f_win_start = 0;
2057 (*curLexSpec)->f_win_end = 0;
2058 (*curLexSpec)->f_win_rf = p->readf;
2059 (*curLexSpec)->f_win_sf = p->seekf;
2060 (*curLexSpec)->f_win_fh = p->fh;
2061 (*curLexSpec)->f_win_ef = p->endf;
/* fixed window size; units are not visible here */
2062 (*curLexSpec)->f_win_size = 500000;
2064 (*curLexSpec)->m = p->mem;
2065 return lexRoot (*curLexSpec, p->offset, "main");
/*
 * Record-type descriptor for the Tcl filter, and the exported handle that
 * points at it.
 * NOTE(review): the initializer members of tcl_type are not visible in this
 * listing.
 */
2068 static struct recTypeGrs tcl_type = {
2075 RecTypeGrs recTypeGrs_tcl = &tcl_type;