2 * Copyright (C) 1994-1997, Index Data I/S
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.13 1997-12-12 06:33:58 adam
8 * Fixed bug that showed up when multiple filters were used.
9 * Made one routine thread-safe.
11 * Revision 1.12 1997/11/18 10:03:24 adam
12 * Member num_children removed from data1_node.
14 * Revision 1.11 1997/11/06 11:41:01 adam
15 * Implemented "begin variant" for the sgml.regx filter.
17 * Revision 1.10 1997/10/31 12:36:12 adam
18 * Minor change that avoids compiler warning.
20 * Revision 1.9 1997/09/29 09:02:49 adam
21 * Fixed small bug (introduced by previous commit).
23 * Revision 1.8 1997/09/17 12:19:22 adam
24 * Zebra version corresponds to YAZ version 1.4.
25 * Changed Zebra server so that it doesn't depend on global common_resource.
27 * Revision 1.7 1997/07/15 16:33:07 adam
28 * Check for zero length in execData.
30 * Revision 1.6 1997/02/24 10:41:51 adam
31 * Cleanup of code and commented out the "end element-end-record" code.
33 * Revision 1.5 1997/02/19 16:22:33 adam
34 * Fixed "end element" to terminate record in outer-most level.
36 * Revision 1.4 1997/02/12 20:42:58 adam
37 * Changed some log messages.
39 * Revision 1.3 1996/11/08 14:05:33 adam
40 * Bug fix: data1 node member u.tag.get_bytes wasn't initialized.
42 * Revision 1.2 1996/10/29 14:02:09 adam
43 * Doesn't use the global data1_tabpath (from YAZ). Instead the function
44 * data1_get_tabpath is used.
46 * Revision 1.1 1996/10/11 10:57:30 adam
47 * New module recctrl. Used to manage records (extract/retrieval).
49 * Revision 1.24 1996/06/17 14:25:31 adam
50 * Removed LOG_DEBUG logs; can still be enabled by setting REGX_DEBUG.
52 * Revision 1.23 1996/06/04 10:19:00 adam
53 * Minor changes - removed include of ctype.h.
55 * Revision 1.22 1996/06/03 15:23:13 adam
56 * Bug fix: /../ BODY /../ - pattern didn't match EOF.
58 * Revision 1.21 1996/05/14 16:58:38 adam
61 * Revision 1.20 1996/05/01 13:46:36 adam
62 * First work on multiple records in one file.
63 * New option, -offset, to the "unread" command in the filter module.
65 * Revision 1.19 1996/02/12 16:18:20 adam
66 * Yet another bug fix in implementation of unread command.
68 * Revision 1.18 1996/02/12 16:07:54 adam
69 * Bug fix in new unread command.
71 * Revision 1.17 1996/02/12 15:56:11 adam
72 * New code command: unread.
74 * Revision 1.16 1996/01/17 14:57:51 adam
75 * Prototype changed for reader functions in extract/retrieve. File
76 * is identified by 'void *' instead of 'int'.
78 * Revision 1.15 1996/01/08 19:15:47 adam
79 * New input filter that works!
81 * Revision 1.14 1996/01/08 09:10:38 adam
82 * Yet another complete rework on this module.
84 * Revision 1.13 1995/12/15 17:21:50 adam
85 * This version is able to set data.formatted_text in data1-nodes.
87 * Revision 1.12 1995/12/15 16:20:10 adam
88 * The filter files (*.flt) are read from the path given by data1_tabpath.
90 * Revision 1.11 1995/12/15 12:35:16 adam
93 * Revision 1.10 1995/12/15 10:35:36 adam
96 * Revision 1.9 1995/12/14 16:38:48 adam
97 * Completely new attempt to make regular expression parsing.
99 * Revision 1.8 1995/12/13 17:16:59 adam
102 * Revision 1.7 1995/12/13 16:51:58 adam
103 * Modified to set last_child in data1_nodes.
104 * Uses destroy handler to free up data text nodes.
106 * Revision 1.6 1995/12/13 13:45:37 quinn
107 * Changed data1 to use nmem.
109 * Revision 1.5 1995/12/11 09:12:52 adam
110 * The rec_get function returns NULL if record doesn't exist - will
111 * happen in the server if the result set records have been deleted since
112 * the creation of the set (i.e. the search).
113 * The server saves a result temporarily if it is 'volatile', i.e. the
114 * set is register dependent.
116 * Revision 1.4 1995/12/05 16:57:40 adam
117 * More work on regular patterns.
119 * Revision 1.3 1995/12/05 09:37:09 adam
120 * One malloc was renamed to xmalloc.
122 * Revision 1.2 1995/12/04 17:59:24 adam
123 * More work on regular expression conversion.
125 * Revision 1.1 1995/12/04 14:25:30 adam
126 * Started work on regular expression parsed input to structured records.
134 #include <zebrautl.h>
/* Sentinel stream position; the scanning code in this file compares
 * positions against it to recognise end of input. */
140 #define F_WIN_EOF 2000000000
/* Token code that readOneSpec() checks for to recognise a pattern token
 * returned by readParseToken(). */
144 #define REGX_PATTERN 1
/* One entry in a rule's action list. The member annotations indicate that
 * an entry carries either a compiled pattern (REGX_PATTERN) or a code
 * fragment (REGX_CODE); entries are chained through next.
 * NOTE(review): several lines of this declaration are absent from this
 * listing (other code reaches the members as u.pattern.dfa, u.pattern.body
 * and u.code) - confirm the exact layout against the full source. */
154 struct lexRuleAction {
158 struct DFA *dfa; /* REGX_PATTERN */
161 struct regxCode *code; /* REGX_CODE */
163 struct lexRuleAction *next;
/* NOTE(review): the lines below are members of several structures whose
 * opening and closing lines are absent from this listing. The grouping
 * described here is inferred from how the members are accessed elsewhere in
 * the file (rp->info.actionList, spec->trans.rules, spec->f_win_rf, ...) -
 * confirm against the full source. */
/* Head of the actions to execute for a rule (reached as info.actionList). */
168 struct lexRuleAction *actionList;
/* A rule: its info record plus the link to the next rule in the list. */
172 struct lexRuleInfo info;
173 struct lexRule *next;
/* Rule list head, and a table that readFileSpec() fills so that a rule's
 * info can be found directly by rule number. */
178 struct lexRule *rules;
179 struct lexRuleInfo **fastRule;
/* The rule set owned by a lexSpec (reached as spec->trans). */
185 struct lexTrans trans;
/* Input callbacks, set by grs_read_regx() from the caller's endf, readf and
 * seekf; each receives the file handle stored in f_win_fh as its first
 * argument. */
190 void (*f_win_ef)(void *, off_t);
196 int (*f_win_rf)(void *, char *, size_t);
197 off_t (*f_win_sf)(void *, off_t);
/* Action lists that lexRoot() executes before and after the main scan. */
199 struct lexRuleAction *beginActionList;
200 struct lexRuleAction *endActionList;
/*
 * f_win_get: return a pointer to buffered input starting at start_pos and
 * store in *size how many bytes are usable there, capped at
 * end_pos - start_pos.
 *
 * Three situations are handled:
 *  - start_pos lies outside the current window: seek with f_win_sf, read a
 *    fresh window with f_win_rf (the buffer is allocated on first use) and
 *    return the start of the buffer;
 *  - the requested range ends inside the current window: return a pointer
 *    into the existing buffer without any I/O;
 *  - otherwise: move the bytes from start_pos onwards to the front of the
 *    buffer, read more data behind them and return the start of the buffer.
 *
 * The returned pointer refers to spec->f_win_buf, so a later call may move
 * or overwrite the data it points at.
 * NOTE(review): this listing omits some lines of the function (the last
 * parameter, the local declarations, parts of the read calls); confirm the
 * details against the full source.
 */
204 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
/* Case 1: requested start is not covered by the window - reload it. */
209 if (start_pos < spec->f_win_start || start_pos >= spec->f_win_end)
211 (*spec->f_win_sf)(spec->f_win_fh, start_pos);
212 spec->f_win_start = start_pos;
214 if (!spec->f_win_buf)
215 spec->f_win_buf = xmalloc (spec->f_win_size);
216 *size = (*spec->f_win_rf)(spec->f_win_fh, spec->f_win_buf,
218 spec->f_win_end = spec->f_win_start + *size;
/* Never report more than the caller asked for. */
220 if (*size > end_pos - start_pos)
221 *size = end_pos - start_pos;
222 return spec->f_win_buf;
/* Case 2: the whole range is already buffered. */
224 if (end_pos <= spec->f_win_end)
226 *size = end_pos - start_pos;
227 return spec->f_win_buf + (start_pos - spec->f_win_start);
/* Case 3: keep the buffered tail, slide it to the front and read on. */
229 off = start_pos - spec->f_win_start;
230 for (i = 0; i<spec->f_win_end - start_pos; i++)
231 spec->f_win_buf[i] = spec->f_win_buf[i + off];
232 r = (*spec->f_win_rf)(spec->f_win_fh,
234 spec->f_win_size - i);
235 spec->f_win_start = start_pos;
236 spec->f_win_end += r;
238 if (*size > end_pos - start_pos)
239 *size = end_pos - start_pos;
240 return spec->f_win_buf;
/*
 * f_win_advance: fetch the byte at position *pos.
 * Fast path: when *pos is inside the current window the byte is returned
 * straight from the buffer and *pos is incremented.
 * Otherwise *pos is compared with F_WIN_EOF and a one-byte range is
 * requested through f_win_get.
 * NOTE(review): the statements that follow these tests are not part of
 * this listing. Callers check *pos == F_WIN_EOF right after the call, which
 * suggests the position is set to F_WIN_EOF at end of input - confirm.
 */
243 static int f_win_advance (struct lexSpec *spec, int *pos)
248 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
249 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
250 if (*pos == F_WIN_EOF)
252 buf = f_win_get (spec, *pos, *pos+1, &size);
/*
 * regxCodeDel: operates on the regxCode object referenced through pp.
 * NOTE(review): only the first statement is present in this listing; the
 * function is used by actionListDel() as the disposal routine for code
 * actions, so it presumably frees the object - confirm in the full source.
 */
262 static void regxCodeDel (struct regxCode **pp)
264 struct regxCode *p = *pp;
/*
 * regxCodeMk: allocate a regxCode object and give it a private copy of the
 * len bytes at buf (len+1 bytes are allocated for the copy).
 * NOTE(review): the lines that terminate the string and store the object
 * through pp are not part of this listing - confirm.
 */
273 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
277 p = xmalloc (sizeof(*p));
278 p->str = xmalloc (len+1);
279 memcpy (p->str, buf, len);
/*
 * lexSpecDFA: prepare a DFA object for this module's pattern syntax by
 * adjusting the DFA parser's character map: the entries for space and tab
 * are deleted and an entry for '/' is added with value 0.
 * NOTE(review): creation and return of the dfa object are on lines absent
 * from this listing.
 */
284 static struct DFA *lexSpecDFA (void)
289 dfa_parse_cmap_del (dfa, ' ');
290 dfa_parse_cmap_del (dfa, '\t');
291 dfa_parse_cmap_add (dfa, '/', 0);
/*
 * lexSpecMk: allocate a lexSpec for the filter called name.
 * The name is copied, a DFA for the rule set is obtained from lexSpecDFA(),
 * and the rule list, the fast rule table and both begin/end action lists
 * start out empty (NULL).
 * NOTE(review): initialisation of further members and the return statement
 * are not visible in this listing.
 */
295 static struct lexSpec *lexSpecMk (const char *name)
299 p = xmalloc (sizeof(*p));
300 p->name = xmalloc (strlen(name)+1);
301 strcpy (p->name, name);
302 p->trans.dfa = lexSpecDFA ();
303 p->trans.rules = NULL;
304 p->trans.fastRule = NULL;
305 p->beginActionList = NULL;
306 p->endActionList = NULL;
/*
 * actionListDel: walk the action list at *rap and release what each entry
 * owns - the DFA of a pattern action (dfa_delete) or the code object of a
 * code action (regxCodeDel).
 * The loop advances through ra1, i.e. the successor is taken from a
 * separate variable rather than from the entry being processed.
 * NOTE(review): the statements selecting between the two cases, freeing
 * the entry itself and resetting *rap are absent from this listing.
 */
311 static void actionListDel (struct lexRuleAction **rap)
313 struct lexRuleAction *ra1, *ra;
315 for (ra = *rap; ra; ra = ra1)
321 dfa_delete (&ra->u.pattern.dfa);
324 regxCodeDel (&ra->u.code);
/*
 * lexSpecDel: release the resources held by the lexSpec referenced through
 * pp: the rule-set DFA, the fast rule table, the action list of every rule
 * in the rule list, the begin and end action lists, and the input window
 * buffer.
 * NOTE(review): the handling of a NULL spec, the freeing of the rule
 * entries and of the spec itself are on lines absent from this listing.
 */
332 static void lexSpecDel (struct lexSpec **pp)
335 struct lexRule *rp, *rp1;
341 dfa_delete (&p->trans.dfa);
343 xfree (p->trans.fastRule);
/* rp1 carries the successor so the current rule can be disposed of. */
344 for (rp = p->trans.rules; rp; rp = rp1)
347 actionListDel (&rp->info.actionList);
350 actionListDel (&p->beginActionList);
351 actionListDel (&p->endActionList);
352 xfree (p->f_win_buf);
/*
 * readParseToken: read the next token of a filter specification starting
 * at *cpp and return its token code.
 * Visible behaviour:
 *  - leading spaces, tabs and newlines are skipped;
 *  - a command word is collected into cmd with upper-case letters folded
 *    to lower case, guarded against overrunning cmd;
 *  - an unexpected character is reported ("bad character") and input is
 *    skipped up to the next whitespace;
 *  - the words "begin", "end" and "body" are recognised, anything else is
 *    reported as "bad command".
 * Callers treat a zero result as "no more tokens".
 * NOTE(review): the branches for other token kinds, the return statements
 * and the use of len are on lines absent from this listing.
 */
357 static int readParseToken (const char **cpp, int *len)
359 const char *cp = *cpp;
363 while (*cp == ' ' || *cp == '\t' || *cp == '\n')
392 if (*cp >= 'a' && *cp <= 'z')
394 else if (*cp >= 'A' && *cp <= 'Z')
395 cmd[i] = *cp + 'a' - 'A';
/* Stop collecting before cmd would overflow. */
398 if (i > sizeof(cmd)-2)
406 logf (LOG_WARN, "bad character %d %c", *cp, *cp);
408 while (*cp && *cp != ' ' && *cp != '\t' && *cp != '\n')
414 if (!strcmp (cmd, "begin"))
416 else if (!strcmp (cmd, "end"))
418 else if (!strcmp (cmd, "body"))
422 logf (LOG_WARN, "bad command %s", cmd);
/*
 * actionListMk: parse the specification text s token by token and build a
 * list of actions, appending through ap.
 * Visible behaviour:
 *  - a code token gets a new action holding a regxCode made from the token
 *    text;
 *  - a pattern token gets a new action with its own DFA: the body flag is
 *    recorded, the expression is parsed with dfa_parse (errors are logged)
 *    and the DFA states are built with dfa_mkstate;
 *  - "cannot use begin here" is logged for a token that is not allowed in
 *    an action list.
 * NOTE(review): the switch/case structure, the linking of each new action
 * and the return value are on lines absent from this listing.
 */
428 static int actionListMk (struct lexSpec *spec, const char *s,
429 struct lexRuleAction **ap)
434 while ((tok = readParseToken (&s, &len)))
442 *ap = xmalloc (sizeof(**ap));
444 regxCodeMk (&(*ap)->u.code, s, len);
448 *ap = xmalloc (sizeof(**ap));
450 (*ap)->u.pattern.body = bodyMark;
452 (*ap)->u.pattern.dfa = lexSpecDFA ();
453 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
458 logf (LOG_WARN, "regular expression error. r=%d", r);
461 dfa_mkstate ((*ap)->u.pattern.dfa);
465 logf (LOG_WARN, "cannot use begin here");
468 *ap = xmalloc (sizeof(**ap));
/*
 * readOneSpec: process one specification entry held in s.
 * The first token decides what the entry defines:
 *  - REGX_BEGIN: the spec's begin action list is discarded and rebuilt from
 *    the rest of the entry;
 *  - REGX_END: likewise for the end action list;
 *  - REGX_PATTERN: the regular expression is parsed into the shared
 *    rule-set DFA, a new rule receives the next rule number, is pushed onto
 *    the front of the rule list, and its action list is built from the rest
 *    of the entry.
 * Parse errors are reported through logf.
 * NOTE(review): the return statements and the checks that lead to the two
 * warnings are on lines absent from this listing.
 */
478 int readOneSpec (struct lexSpec *spec, const char *s)
482 tok = readParseToken (&s, &len);
483 if (tok == REGX_BEGIN)
485 actionListDel (&spec->beginActionList);
486 actionListMk (spec, s, &spec->beginActionList);
488 else if (tok == REGX_END)
490 actionListDel (&spec->endActionList);
491 actionListMk (spec, s, &spec->endActionList);
493 else if (tok == REGX_PATTERN)
497 r = dfa_parse (spec->trans.dfa, &s);
500 logf (LOG_WARN, "regular expression error. r=%d", r);
505 logf (LOG_WARN, "expects / at end of pattern. got %c", *s);
/* New rules are inserted at the head of the list. */
509 rp = xmalloc (sizeof(*rp));
510 rp->info.no = spec->trans.ruleNo++;
511 rp->next = spec->trans.rules;
512 spec->trans.rules = rp;
513 actionListMk (spec, s, &rp->info.actionList);
/*
 * readFileSpec: load the filter definition "<spec->name>.flt".
 * The file is looked up along the path returned by data1_get_tabpath();
 * failure to open it is logged. Rule numbering starts at 1. Each entry read
 * from the file is handed to readOneSpec(). Afterwards the fastRule table
 * is allocated, cleared and filled so that fastRule[n] points at the info
 * of rule number n, and the rule-set DFA states are built with dfa_mkstate.
 * NOTE(review): the character-reading loop is only partly present in this
 * listing; the tests on '#', newline, space and tab suggest comment/blank
 * line skipping and continuation lines, but confirm against the full
 * source. The size of lineBuf (lineSize) is not visible either, so whether
 * the sprintf below can overflow for long names cannot be judged from here.
 */
518 int readFileSpec (struct lexSpec *spec)
523 int c, i, errors = 0;
526 lineBuf = xmalloc (1+lineSize);
527 logf (LOG_LOG, "reading regx filter %s.flt", spec->name);
/* lineBuf doubles as scratch space for the file name. */
528 sprintf (lineBuf, "%s.flt", spec->name);
529 if (!(spec_inf = yaz_path_fopen (data1_get_tabpath(spec->dh),
532 logf (LOG_ERRNO|LOG_WARN, "cannot read spec file %s", spec->name);
537 spec->trans.ruleNo = 1;
542 if (c == '#' || c == '\n' || c == ' ' || c == '\t')
544 while (c != '\n' && c != EOF)
563 if (c != ' ' && c != '\t')
572 readOneSpec (spec, lineBuf);
573 spec->lineNo += addLine;
/* Build the rule-number -> rule-info lookup table. */
578 spec->trans.fastRule = xmalloc (sizeof(*spec->trans.fastRule) *
580 for (i = 0; i<spec->trans.ruleNo; i++)
581 spec->trans.fastRule[i] = NULL;
582 for (rp = spec->trans.rules; rp; rp = rp->next)
583 spec->trans.fastRule[rp->info.no] = &rp->info;
589 debug_dfa_followpos = 1;
592 dfa_mkstate (spec->trans.dfa);
/* Filter specification most recently loaded by grs_read_regx(); it is kept
 * between calls and replaced when a different type name is requested. */
596 static struct lexSpec *curLexSpec = NULL;
/*
 * destroy_data: destroy handler for data nodes whose text was allocated
 * with xmalloc (execData installs it in that case). Asserts that n is a
 * data node and frees its text buffer.
 */
598 static void destroy_data (struct data1_node *n)
600 assert (n->which == DATA1N_data);
601 xfree (n->u.data.data);
/*
 * execData: add the elen bytes at ebuf as text data below the node at
 * d1_stack[*d1_level - 1].
 * If the node currently stored at d1_stack[*d1_level] is a data node, the
 * text is appended to it; otherwise a new data node is created, linked in
 * as the parent's last child and stored at d1_stack[*d1_level].
 * Text of at most DATA1_LOCALDATA bytes lives in the node's local buffer;
 * longer text is kept in an xmalloc'ed buffer and destroy_data is installed
 * to free it.
 * formatted_text is stored in the new node's formatted_text field.
 * NOTE(review): some lines (early returns, else keywords, first-child
 * linking) are absent from this listing.
 */
604 static void execData (struct lexSpec *spec,
605 data1_node **d1_stack, int *d1_level,
606 const char *ebuf, int elen, int formatted_text)
608 struct data1_node *res, *parent;
610 if (elen == 0) /* shouldn't happen, but it does! */
/* Debug output: long text is abbreviated to its first and last 15 bytes. */
614 logf (LOG_DEBUG, "execData %.15s ... %.*s", ebuf, 15, ebuf + elen-15);
616 logf (LOG_DEBUG, "execData %.*s", elen, ebuf);
618 logf (LOG_DEBUG, "execData len=%d", elen);
624 parent = d1_stack[*d1_level -1];
/* Append to an existing data node at this level ... */
626 if ((res=d1_stack[*d1_level]) && res->which == DATA1N_data)
/* ... in place when the combined text still fits the local buffer, */
628 if (elen + res->u.data.len <= DATA1_LOCALDATA)
629 memcpy (res->u.data.data + res->u.data.len, ebuf, elen);
/* ... otherwise into a new heap buffer; the old buffer is freed only if it
 * was itself heap-allocated (length above DATA1_LOCALDATA). */
632 char *nb = xmalloc (elen + res->u.data.len);
633 memcpy (nb, res->u.data.data, res->u.data.len);
634 memcpy (nb + res->u.data.len, ebuf, elen);
635 if (res->u.data.len > DATA1_LOCALDATA)
636 xfree (res->u.data.data);
637 res->u.data.data = nb;
638 res->destroy = destroy_data;
640 res->u.data.len += elen;
/* No data node to extend: create a fresh one. */
644 res = data1_mk_node (spec->dh, spec->m);
645 res->parent = parent;
646 res->which = DATA1N_data;
647 res->u.data.what = DATA1I_text;
648 res->u.data.len = elen;
649 res->u.data.formatted_text = formatted_text;
650 if (elen > DATA1_LOCALDATA)
652 res->u.data.data = xmalloc (elen);
653 res->destroy = destroy_data;
656 res->u.data.data = res->lbuf;
657 memcpy (res->u.data.data, ebuf, elen);
658 res->root = parent->root;
/* Link the node after its predecessor at this level and make it current. */
660 parent->last_child = res;
661 if (d1_stack[*d1_level])
662 d1_stack[*d1_level]->next = res;
665 d1_stack[*d1_level] = res;
/*
 * execDataP: entry point used by lexNode() for text found between rule
 * matches; it forwards all arguments unchanged to execData().
 */
669 static void execDataP (struct lexSpec *spec,
670 data1_node **d1_stack, int *d1_level,
671 const char *ebuf, int elen, int formatted_text)
673 execData (spec, d1_stack, d1_level, ebuf, elen, formatted_text);
/*
 * variantBegin: open a variant node described by a class name, a type name
 * and a value (each given as pointer + length, not NUL-terminated).
 * Steps visible here:
 *  - class and type are copied into local buffers, truncated to
 *    DATA1_MAX_SYMBOL-1 characters, and NUL-terminated;
 *  - the variant type is looked up in the record's variant set with
 *    data1_getvartypebyct();
 *  - if the parent is not itself a variant node, an enclosing variant node
 *    with type and value 0 is created and pushed first;
 *  - the stack is searched upwards through variant nodes for one with the
 *    same type;
 *  - a new variant node is created whose value is a copy of value_str
 *    truncated to DATA1_LOCALDATA-1 characters, linked below the current
 *    parent and pushed (a NULL slot is opened for its children).
 * A warning is logged when no record type has been defined.
 * NOTE(review): several lines (local declarations, the remaining lookup
 * arguments, the action taken when a matching type is found, first-child
 * linking) are absent from this listing.
 */
676 static void variantBegin (struct lexSpec *spec,
677 data1_node **d1_stack, int *d1_level,
678 const char *class_str, int class_len,
679 const char *type_str, int type_len,
680 const char *value_str, int value_len)
682 struct data1_node *parent = d1_stack[*d1_level -1];
683 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
690 logf (LOG_WARN, "in variant begin. No record type defined");
/* Bounded, NUL-terminated copies of the class and type names. */
693 if (class_len >= DATA1_MAX_SYMBOL)
694 class_len = DATA1_MAX_SYMBOL-1;
695 memcpy (tclass, class_str, class_len);
696 tclass[class_len] = '\0';
698 if (type_len >= DATA1_MAX_SYMBOL)
699 type_len = DATA1_MAX_SYMBOL-1;
700 memcpy (ttype, type_str, type_len);
701 ttype[type_len] = '\0';
704 logf (LOG_DEBUG, "variant begin %s %s (%d)", tclass, ttype, *d1_level);
708 data1_getvartypebyct(spec->dh, parent->root->u.root.absyn->varset,
/* First variant below a non-variant parent: insert a container node. */
712 if (parent->which != DATA1N_variant)
714 res = data1_mk_node (spec->dh, spec->m);
715 res->parent = parent;
716 res->which = DATA1N_variant;
717 res->u.variant.type = 0;
718 res->u.variant.value = 0;
719 res->root = parent->root;
721 parent->last_child = res;
722 if (d1_stack[*d1_level])
723 d1_stack[*d1_level]->next = res;
726 d1_stack[*d1_level] = res;
727 d1_stack[++(*d1_level)] = NULL;
/* Look upwards through the enclosing variant nodes for the same type. */
729 for (i = *d1_level-1; d1_stack[i]->which == DATA1N_variant; i--)
730 if (d1_stack[i]->u.variant.type == tp)
737 logf (LOG_DEBUG, "variant node (%d)", *d1_level);
/* Create the variant node proper. */
739 parent = d1_stack[*d1_level-1];
740 res = data1_mk_node (spec->dh, spec->m);
741 res->parent = parent;
742 res->which = DATA1N_variant;
743 res->root = parent->root;
744 res->u.variant.type = tp;
/* The value is stored in the node's local buffer, truncated if needed. */
746 if (value_len >= DATA1_LOCALDATA)
747 value_len =DATA1_LOCALDATA-1;
748 memcpy (res->lbuf, value_str, value_len);
749 res->lbuf[value_len] = '\0';
751 res->u.variant.value = res->lbuf;
753 parent->last_child = res;
754 if (d1_stack[*d1_level])
755 d1_stack[*d1_level]->next = res;
758 d1_stack[*d1_level] = res;
759 d1_stack[++(*d1_level)] = NULL;
/*
 * tagBegin: open an element (tag node) named by the len bytes at tag.
 * A new tag node is created below d1_stack[*d1_level - 1]; the tag name is
 * copied into the node's local buffer (truncated to DATA1_LOCALDATA-1
 * characters and NUL-terminated), get_bytes is set to -1, the matching
 * element definition is looked up with data1_getelementbytagname(), and the
 * selection flags are cleared. The node is linked as the parent's last
 * child, stored at the current stack level, and a new level is opened with
 * a NULL slot for its children.
 * A warning is logged when no record type has been defined.
 * NOTE(review): the conditions around the parent-tag/element lookup and the
 * first-child linking are on lines absent from this listing.
 */
762 static void tagBegin (struct lexSpec *spec,
763 data1_node **d1_stack, int *d1_level,
764 const char *tag, int len)
766 struct data1_node *parent = d1_stack[*d1_level -1];
767 data1_element *elem = NULL;
768 data1_node *partag = get_parent_tag(spec->dh, parent);
770 data1_element *e = NULL;
775 logf (LOG_WARN, "in element begin. No record type defined");
779 res = data1_mk_node (spec->dh, spec->m);
780 res->parent = parent;
781 res->which = DATA1N_tag;
782 res->u.tag.get_bytes = -1;
/* Bounded copy of the tag name into the node's own buffer. */
784 if (len >= DATA1_LOCALDATA)
785 len = DATA1_LOCALDATA-1;
786 memcpy (res->lbuf, tag, len);
787 res->lbuf[len] = '\0';
788 res->u.tag.tag = res->lbuf;
791 logf (LOG_DEBUG, "tag begin %s (%d)", res->u.tag.tag, *d1_level);
793 if (parent->which == DATA1N_variant)
796 if (!(e = partag->u.tag.element))
799 elem = data1_getelementbytagname (spec->dh, d1_stack[0]->u.root.absyn,
801 res->u.tag.element = elem;
802 res->u.tag.node_selected = 0;
803 res->u.tag.make_variantlist = 0;
804 res->u.tag.no_data_requested = 0;
805 res->root = parent->root;
/* Link the node in and descend one level. */
807 parent->last_child = res;
808 if (d1_stack[*d1_level])
809 d1_stack[*d1_level]->next = res;
812 d1_stack[*d1_level] = res;
813 d1_stack[++(*d1_level)] = NULL;
/*
 * tagEnd: close open elements by moving back up the node stack while
 * *d1_level is greater than 1, looking for a tag node whose name equals the
 * len bytes at tag.
 * Callers also pass tag == NULL with len 0.
 * NOTE(review): the level decrement, the part of the condition that deals
 * with a NULL tag and the loop exit are on lines absent from this listing -
 * confirm the exact stopping rule against the full source.
 */
816 static void tagEnd (struct lexSpec *spec,
817 data1_node **d1_stack, int *d1_level,
818 const char *tag, int len)
820 while (*d1_level > 1)
823 if ((d1_stack[*d1_level]->which == DATA1N_tag) &&
825 (strlen(d1_stack[*d1_level]->u.tag.tag) == (size_t) len &&
826 !memcmp (d1_stack[*d1_level]->u.tag.tag, tag, len))))
830 logf (LOG_DEBUG, "tag end (%d)", *d1_level);
/*
 * tryMatch: search the input, beginning at *pptr, for text accepted by a
 * DFA. Characters are fetched with f_win_advance() and fed through the
 * DFA's transition ranges starting from state 0; the rule number of the
 * last accepting state seen is remembered (taken from rule_no or rule_nno).
 * On a match *mptr receives the position where the match starts and *pptr
 * the position just past its end.
 * NOTE(review): the final parameter, the loop structure, the use of c_prev
 * and the return values are on lines absent from this listing; callers
 * treat a zero result as "no match".
 */
835 static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
838 struct DFA_state *state = dfa->states[0];
841 unsigned char c_prev = 0;
843 int start_ptr = *pptr;
850 c = f_win_advance (spec, &ptr);
851 if (ptr == F_WIN_EOF)
868 *mptr = start_ptr; /* match starts here */
869 *pptr = last_ptr; /* match end here (+1) */
/* Restart the automaton from its initial state. */
872 state = dfa->states[0];
877 else if (c >= t->ch[0] && c <= t->ch[1])
879 state = dfa->states[t->to];
884 last_rule = state->rule_no;
889 last_rule = state->rule_nno;
/*
 * execTok: extract the next token from the code string at *src and report
 * it through tokBuf/tokLen.
 * Visible token forms:
 *  - leading spaces and tabs are skipped;
 *  - "$" followed by digits names a match argument; its text is fetched
 *    from the input window via f_win_get() using arg_start[n]/arg_end[n];
 *  - a double-quoted string, whose token text runs up to the closing quote;
 *  - a newline or ';' (statement separator);
 *  - any other run of characters up to whitespace, newline or ';'.
 * NOTE(review): the return codes, the range check on the argument number
 * and the update of *src are on lines absent from this listing. Callers
 * compare the result with 3 when testing for option words such as "-text".
 */
901 static int execTok (struct lexSpec *spec, const char **src,
902 int arg_no, int *arg_start, int *arg_end,
903 const char **tokBuf, int *tokLen)
905 const char *s = *src;
907 while (*s == ' ' || *s == '\t')
/* "$N": reference to match argument N. */
911 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
915 while (*s >= '0' && *s <= '9')
916 n = n*10 + (*s++ -'0');
926 *tokBuf = f_win_get (spec, arg_start[n], arg_end[n], tokLen);
932 while (*s && *s != '\"')
934 *tokLen = s - *tokBuf;
939 else if (*s == '\n' || *s == ';')
947 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
949 *tokLen = s - *tokBuf;
956 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != ';')
958 *tokLen = s - *tokBuf;
/*
 * regxStrz: copy the len bytes at src into the caller-supplied buffer str.
 * NOTE(review): length limiting, NUL termination and the return statement
 * are on lines absent from this listing; execCode() passes the result to
 * strcmp(), so a terminated string is presumably returned - confirm.
 */
964 static char *regxStrz (const char *src, int len, char *str)
968 memcpy (str, src, len);
/*
 * execCode: interpret the code fragment attached to an action.
 * The fragment is tokenised with execTok(); $N tokens refer to the match
 * arguments described by arg_no/arg_start/arg_end. Commands visible here:
 *   begin record <name>          - look up the abstract syntax <name> and
 *                                  create the root node of a new record
 *   begin element <tag>          - tagBegin()
 *   begin variant <c> <t> <v>    - variantBegin()
 *   end record                   - logs "end record"
 *   end element [<tag>]          - tagEnd(), with NULL when no tag is given
 *   data [-text] [-element <e>] <items...>
 *                                - execData() for each item, optionally
 *                                  wrapped in element <e>
 *   unread [-offset <n>] <index> - set *pptr to arg_start[index] + offset
 * Unknown commands and stray tokens are reported with logf.
 * NOTE(review): many lines (loop structure, checks on execTok results,
 * return statements) are absent from this listing - confirm details
 * against the full source.
 */
973 static int execCode (struct lexSpec *spec,
974 int arg_no, int *arg_start, int *arg_end, int *pptr,
975 struct regxCode *code,
976 data1_node **d1_stack, int *d1_level)
978 const char *s = code->str;
983 r = execTok (spec, &s, arg_no, arg_start, arg_end, &cmd_str, &cmd_len);
990 r = execTok (spec, &s, arg_no, arg_start, arg_end,
994 p = regxStrz (cmd_str, cmd_len, ptmp);
/* ---- begin ... ---- */
995 if (!strcmp (p, "begin"))
997 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1001 p = regxStrz (cmd_str, cmd_len, ptmp);
1002 if (!strcmp (p, "record"))
1004 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1005 &cmd_str, &cmd_len);
/* absynName is static, so the root node's type pointer set below refers to
 * storage shared by all calls of this function. */
1010 static char absynName[64];
1015 memcpy (absynName, cmd_str, cmd_len);
1016 absynName[cmd_len] = '\0';
1019 logf (LOG_DEBUG, "begin record %s", absynName);
1021 if (!(absyn = data1_get_absyn (spec->dh, absynName)))
1022 logf (LOG_WARN, "Unknown tagset: %s", absynName);
1027 res = data1_mk_node (spec->dh, spec->m);
1028 res->which = DATA1N_root;
1029 res->u.root.type = absynName;
1030 res->u.root.absyn = absyn;
1033 d1_stack[*d1_level] = res;
1034 d1_stack[++(*d1_level)] = NULL;
1037 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1038 &cmd_str, &cmd_len);
1040 else if (!strcmp (p, "element"))
1042 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1043 &cmd_str, &cmd_len);
1046 tagBegin (spec, d1_stack, d1_level, cmd_str, cmd_len);
1047 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1048 &cmd_str, &cmd_len);
1050 else if (!strcmp (p, "variant"))
1053 const char *class_str = NULL;
1055 const char *type_str = NULL;
1057 const char *value_str = NULL;
1058 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1059 &cmd_str, &cmd_len);
1062 class_str = cmd_str;
1063 class_len = cmd_len;
1064 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1065 &cmd_str, &cmd_len);
1071 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1072 &cmd_str, &cmd_len);
1075 value_str = cmd_str;
1076 value_len = cmd_len;
1078 variantBegin (spec, d1_stack, d1_level, class_str, class_len,
1079 type_str, type_len, value_str, value_len);
1082 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1083 &cmd_str, &cmd_len);
/* ---- end ... ---- */
1086 else if (!strcmp (p, "end"))
1088 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1089 &cmd_str, &cmd_len);
1092 p = regxStrz (cmd_str, cmd_len, ptmp);
1093 if (!strcmp (p, "record"))
1096 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1097 &cmd_str, &cmd_len);
1099 logf (LOG_DEBUG, "end record");
1103 else if (!strcmp (p, "element"))
1105 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1106 &cmd_str, &cmd_len);
1116 tagEnd (spec, d1_stack, d1_level, cmd_str, cmd_len);
1117 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1118 &cmd_str, &cmd_len);
1121 tagEnd (spec, d1_stack, d1_level, NULL, 0);
1124 logf (LOG_WARN, "missing record/element/variant");
1127 logf (LOG_WARN, "missing record/element/variant");
/* ---- data ... ---- */
1129 else if (!strcmp (p, "data"))
1133 const char *element_str = NULL;
/* Leading option tokens (result code 3) are consumed first. */
1135 while ((r = execTok (spec, &s, arg_no, arg_start, arg_end,
1136 &cmd_str, &cmd_len)) == 3)
1138 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1140 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1142 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1143 &element_str, &element_len);
1148 logf (LOG_WARN, "bad data option: %.*s",
1153 logf (LOG_WARN, "missing data item after data");
1157 tagBegin (spec, d1_stack, d1_level, element_str, element_len);
1160 execData (spec, d1_stack, d1_level, cmd_str, cmd_len,
1162 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1163 &cmd_str, &cmd_len);
1166 tagEnd (spec, d1_stack, d1_level, NULL, 0);
/* ---- unread ... ---- */
1168 else if (!strcmp (p, "unread"))
1171 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1172 &cmd_str, &cmd_len);
1173 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1175 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1176 &cmd_str, &cmd_len);
1179 logf (LOG_WARN, "missing number after -offset");
1182 p = regxStrz (cmd_str, cmd_len, ptmp);
1184 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1185 &cmd_str, &cmd_len);
1191 logf (LOG_WARN, "missing index after unread command");
/* The index must be a single decimal digit. */
1194 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1196 logf (LOG_WARN, "bad index after unread command");
1201 no = *cmd_str - '0';
/* Reposition the scanner relative to the start of that argument. */
1204 *pptr = arg_start[no] + offset;
1206 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1207 &cmd_str, &cmd_len);
1211 logf (LOG_WARN, "unknown code command: %.*s", cmd_len, cmd_str);
1212 r = execTok (spec, &s, arg_no, arg_start, arg_end,
1213 &cmd_str, &cmd_len);
1218 logf (LOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1220 r = execTok (spec, &s, arg_no, arg_start, arg_end, &cmd_str,
/*
 * execAction: run an action list for a match that started at start_ptr,
 * with *pptr as the current input position.
 * arg_start[]/arg_end[] record the input ranges of the match arguments;
 * entry 0 starts at start_ptr. Pattern actions call tryMatch() on further
 * input and record the resulting ranges (a pattern flagged as "body" also
 * records the text preceding the match); F_WIN_EOF is stored as a range
 * bound when tryMatch() reports no match. Code actions are handed to
 * execCode() together with the collected argument ranges.
 * NOTE(review): the loop over the list, the switch on the action kind, the
 * arg_no bookkeeping and the return values are on lines absent from this
 * listing.
 */
1229 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1230 data1_node **d1_stack, int *d1_level,
1231 int start_ptr, int *pptr)
1238 arg_start[0] = start_ptr;
/* Pattern with body marker. */
1246 if (ap->u.pattern.body)
1248 arg_start[arg_no] = *pptr;
1249 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1251 arg_end[arg_no] = F_WIN_EOF;
1253 arg_start[arg_no] = F_WIN_EOF;
1254 arg_end[arg_no] = F_WIN_EOF;
1259 arg_end[arg_no] = sptr;
1261 arg_start[arg_no] = sptr;
1262 arg_end[arg_no] = *pptr;
/* Plain pattern. */
1267 arg_start[arg_no] = *pptr;
1268 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa))
1270 if (sptr != arg_start[arg_no])
1272 arg_end[arg_no] = *pptr;
/* Code action. */
1277 if (!execCode (spec, arg_no, arg_start, arg_end, pptr,
1278 ap->u.code, d1_stack, d1_level))
1282 arg_start[arg_no] = *pptr;
1283 arg_end[arg_no] = F_WIN_EOF;
/*
 * execRule: execute the actions of rule number ruleNo. The rule's info is
 * taken from trans->fastRule[ruleNo] and its action list is passed to
 * execAction(), whose result is returned.
 */
1292 static int execRule (struct lexSpec *spec, struct lexTrans *trans,
1293 data1_node **d1_stack, int *d1_level,
1294 int ruleNo, int start_ptr, int *pptr)
1297 logf (LOG_DEBUG, "execRule %d", ruleNo);
1299 return execAction (spec, trans->fastRule[ruleNo]->actionList,
1300 d1_stack, d1_level, start_ptr, pptr);
/*
 * lexNode: main scanning loop. Input is read from *ptr with
 * f_win_advance() and run through the rule-set DFA in trans.
 * Text between skip_ptr and the start of a match is passed to execDataP()
 * as plain data; when a rule has matched, its actions are run with
 * execRule(). If execRule() returns zero and the end-of-record callback
 * f_win_ef is set (and the position is not F_WIN_EOF), the callback is
 * invoked with the current position.
 * c_prev starts out as '\n'.
 * NOTE(review): large parts of the loop (state bookkeeping, updates of
 * skip_ptr/start_ptr/last_ptr, return statements, the last parameter) are
 * absent from this listing - confirm details against the full source.
 */
1303 data1_node *lexNode (struct lexSpec *spec, struct lexTrans *trans,
1304 data1_node **d1_stack, int *d1_level,
1307 struct DFA_state *state = trans->dfa->states[0];
1310 unsigned char c_prev = '\n';
1313 int last_ptr = *ptr;
1314 int start_ptr = *ptr;
1315 int skip_ptr = *ptr;
1319 c = f_win_advance (spec, ptr);
/* End of input: flush pending text and run a pending rule, if any. */
1320 if (*ptr == F_WIN_EOF)
1324 if (skip_ptr < start_ptr)
1328 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1329 execDataP (spec, d1_stack, d1_level, buf, size, 0);
1332 if (!execRule (spec, trans, d1_stack, d1_level, last_rule,
1338 else if (skip_ptr < *ptr)
1342 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1343 execDataP (spec, d1_stack, d1_level, buf, size, 0);
1345 if (*ptr == F_WIN_EOF)
1352 { /* no transition for character c ... */
1355 if (skip_ptr < start_ptr)
1359 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1360 execDataP (spec, d1_stack, d1_level, buf, size, 0);
1363 if (!execRule (spec, trans, d1_stack, d1_level, last_rule,
1366 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1369 logf (LOG_DEBUG, "regx: endf ptr=%d", *ptr);
1371 (*spec->f_win_ef)(spec->f_win_fh, *ptr);
1381 c_prev = f_win_advance (spec, &start_ptr);
1386 c_prev = f_win_advance (spec, &start_ptr);
/* Restart the automaton from its initial state. */
1389 state = trans->dfa->states[0];
1392 else if (c >= t->ch[0] && c <= t->ch[1])
1393 { /* transition ... */
1394 state = trans->dfa->states[t->to];
1399 last_rule = state->rule_no;
1402 else if (state->rule_nno)
1404 last_rule = state->rule_nno;
/*
 * lexRoot: parse one record. A local node stack of 512 entries is set up,
 * the spec's begin actions (if any) are executed, lexNode() scans the
 * input, and finally the end actions (if any) are executed at the position
 * where scanning stopped.
 * NOTE(review): the initial values of d1_level and ptr (presumably derived
 * from offset) and the return statement are on lines absent from this
 * listing. Nothing visible here checks the nesting depth against the 512
 * stack slots.
 */
1416 static data1_node *lexRoot (struct lexSpec *spec, off_t offset)
1418 data1_node *d1_stack[512];
1422 d1_stack[d1_level] = NULL;
1423 if (spec->beginActionList)
1424 execAction (spec, spec->beginActionList,
1425 d1_stack, &d1_level, 0, &ptr);
1426 lexNode (spec, &spec->trans, d1_stack, &d1_level, &ptr);
1427 if (spec->endActionList)
1428 execAction (spec, spec->endActionList,
1429 d1_stack, &d1_level, ptr, &ptr);
/*
 * grs_read_regx: entry point of the regx record reader.
 * The filter specification named by p->type is loaded with readFileSpec()
 * unless the cached curLexSpec already has that name; a previously cached
 * spec is deleted first. The input window is then reset and bound to the
 * caller's read/seek/end callbacks and file handle, the window size is set
 * to 500000 bytes, the caller's memory handle is stored, and the record is
 * parsed by lexRoot() starting from p->offset.
 * NOTE(review): the check of readFileSpec()'s result (which leads to the
 * second lexSpecDel call) and the corresponding return are on lines absent
 * from this listing.
 */
1433 data1_node *grs_read_regx (struct grs_read_info *p)
1438 logf (LOG_DEBUG, "grs_read_regx");
1440 if (!curLexSpec || strcmp (curLexSpec->name, p->type))
1443 lexSpecDel (&curLexSpec);
1444 curLexSpec = lexSpecMk (p->type);
1445 curLexSpec->dh = p->dh;
1446 res = readFileSpec (curLexSpec);
1449 lexSpecDel (&curLexSpec);
/* Start with an empty window so the first access triggers a read. */
1455 curLexSpec->f_win_start = 0;
1456 curLexSpec->f_win_end = 0;
1457 curLexSpec->f_win_rf = p->readf;
1458 curLexSpec->f_win_sf = p->seekf;
1459 curLexSpec->f_win_fh = p->fh;
1460 curLexSpec->f_win_ef = p->endf;
1461 curLexSpec->f_win_size = 500000;
1463 curLexSpec->m = p->mem;
1464 return lexRoot (curLexSpec, p->offset);