* Sebastian Hammer, Adam Dickmeiss
*
* $Log: dfa.c,v $
- * Revision 1.3 1995-09-04 12:33:26 adam
+ * Revision 1.11 1996-01-08 19:15:24 adam
+ * Allow single $ in expressions.
+ *
+ * Revision 1.10 1996/01/08 09:09:17 adam
+ * Function dfa_parse got 'const' string argument.
+ * New functions to define char mappings made public.
+ *
+ * Revision 1.9 1995/12/06 12:24:58 adam
+ * Removed verbatim mode code.
+ *
+ * Revision 1.8 1995/12/06 09:09:58 adam
+ * Work on left and right anchors.
+ *
+ * Revision 1.7 1995/11/27 09:23:02 adam
+ * New berbatim hook in regular expressions. "[]n ..".
+ *
+ * Revision 1.6 1995/10/16 09:31:25 adam
+ * Bug fix.
+ *
+ * Revision 1.5 1995/10/02 15:17:58 adam
+ * Bug fix in dfa_delete.
+ *
+ * Revision 1.4 1995/09/28 09:18:52 adam
+ * Removed various preprocessor defines.
+ *
+ * Revision 1.3 1995/09/04 12:33:26 adam
* Various cleanup. YAZ util used instead.
*
* Revision 1.2 1995/01/25 11:30:50 adam
int debug_dfa_tran = 0;
int debug_dfa_followpos = 0;
int dfa_verbose = 0;
-int yydebug = 0;
static struct DFA_parse *parse_info = NULL;
static int err_code;
static int inside_string;
static const unsigned char *expr_ptr;
-static unsigned short *ctrl_chars;
static struct Tnode **posar;
static SetType poset;
add_follow (Set lastpos, Set firstpos),
dfa_trav (struct Tnode *n),
init_followpos (void),
- mk_dfa_tran (struct DFA_states *dfas),
pr_tran (struct DFA_states *dfas),
pr_verbose (struct DFA_states *dfas),
pr_followpos (void),
break;
case L_CHAR:
t1 = mk_Tnode();
- t1->pos = ++(parse_info->position);
+ t1->pos = ++parse_info->position;
t1->u.ch[1] = t1->u.ch[0] = look_ch;
lex ();
break;
return t1;
}
-static void do_parse (dfap, s, cc, tnp)
-struct DFA_parse *dfap;
-char **s;
-const unsigned short *cc;
-struct Tnode **tnp;
+static void do_parse (struct DFA_parse *dfap, const char **s,
+ struct Tnode **tnp)
{
- int i;
- struct Tnode *t1, *t2;
+ int start_anchor_flag = 0;
+ struct Tnode *t1, *t2, *tn;
- for (i=0; cc[i]; i +=2)
- ;
- ctrl_chars = imalloc ((i+2) * sizeof(*ctrl_chars));
- for (i=0; (ctrl_chars[i] = cc[i]); i ++)
- switch (cc[++i])
- {
- case '(':
- ctrl_chars[i] = L_LP;
- break;
- case ')':
- ctrl_chars[i] = L_RP;
- break;
- case '.':
- ctrl_chars[i] = L_ANY;
- break;
- case '|':
- ctrl_chars[i] = L_ALT;
- break;
- case '?':
- ctrl_chars[i] = L_QUEST;
- break;
- case '+':
- ctrl_chars[i] = L_CLOS1;
- break;
- case '*':
- ctrl_chars[i] = L_CLOS0;
- break;
- case '$':
- ctrl_chars[i] = L_END;
- break;
- case '^':
- ctrl_chars[i] = L_START;
- break;
- case '!':
- ctrl_chars[i] = L_ANY;
- break;
- case '#':
- ctrl_chars[i] = L_ANYZ;
- break;
- case '@':
- ctrl_chars[i] = L_WILD;
- break;
- default:
- ctrl_chars[i] = 0;
- }
parse_info = dfap;
err_code = 0;
- expr_ptr = (unsigned char *) *s;
+ expr_ptr = (const unsigned char *) *s;
inside_string = 0;
lex ();
- t1 = expr_1 ();
+ if (lookahead == L_START)
+ {
+ start_anchor_flag = 1;
+ lex ();
+ }
+ if (lookahead == L_END)
+ {
+ t1 = mk_Tnode ();
+ t1->pos = ++parse_info->position;
+ t1->u.ch[1] = t1->u.ch[0] = '\n';
+ lex ();
+ }
+ else
+ {
+ t1 = expr_1 ();
+ if (t1 && lookahead == L_END)
+ {
+ t2 = mk_Tnode ();
+ t2->pos = ++parse_info->position;
+ t2->u.ch[1] = t2->u.ch[0] = '\n';
+
+ tn = mk_Tnode ();
+ tn->pos = CAT;
+ tn->u.p[0] = t1;
+ tn->u.p[1] = t2;
+ t1 = tn;
+
+ lex ();
+ }
+ }
if (t1 && lookahead == 0)
{
t2 = mk_Tnode();
t2->pos = ++parse_info->position;
t2->u.ch[0] = -(++parse_info->rule);
+ t2->u.ch[1] = start_anchor_flag ? 0 : -(parse_info->rule);
*tnp = mk_Tnode();
(*tnp)->pos = CAT;
err_code = DFA_ERR_SYNTAX;
}
}
- *s = (char *) expr_ptr;
- ifree (ctrl_chars);
+ *s = (const char *) expr_ptr;
}
static int nextchar (int *esc)
{
*esc = 0;
- if (*expr_ptr == '\0' || isspace(*expr_ptr))
+ if (*expr_ptr == '\0')
return 0;
else if (*expr_ptr != '\\')
return *expr_ptr++;
{
case '\r':
case '\n':
- case '\t':
case '\0':
return '\\';
+ case '\t':
+ ++expr_ptr;
+ return ' ';
case 'n':
++expr_ptr;
return '\n';
return L_CHARS;
}
-unsigned short dfa_thompson_chars[] =
-{
- '*', '*',
- '+', '+',
- '|', '|',
- '^', '^',
- '$', '$',
- '?', '?',
- '.', '.',
- '(', '(',
- ')', ')',
- 0
-};
-
-unsigned short dfa_ccl_chars[] =
-{
- '#', '#',
- '!', '!',
- '?', '@',
- 0
-};
-
static int lex_sub(void)
{
int esc;
- const unsigned short *cc;
while ((look_ch = nextchar (&esc)) != 0)
if (look_ch == '\"')
{
return L_CHAR;
else if (look_ch == '[')
return read_charset();
- else if (look_ch == ' ')
- return 0;
- else
+ else
{
- for (cc = ctrl_chars; *cc; cc += 2)
+ const int *cc;
+ for (cc = parse_info->charMap; *cc; cc += 2)
if (*cc == look_ch)
+ {
+ if (!cc[1])
+ --expr_ptr;
return cc[1];
+ }
return L_CHAR;
}
return 0;
n->lastpos = add_Set (poset, n->lastpos, n->pos);
if (debug_dfa_trav)
if (n->u.ch[0] < 0)
- printf ("#%d", -n->u.ch[0]);
+ printf ("#%d (n#%d)", -n->u.ch[0], -n->u.ch[1]);
else if (n->u.ch[1] > n->u.ch[0])
{
putchar ('[');
static void mk_dfa_tran (struct DFA_states *dfas)
{
- int i, c;
+ int i, j, c;
int max_char;
short *pos, *pos_i;
Set tran_set;
while ((dfa_from = get_DFA_state (dfas)))
{
pos_i = pos;
- i = 0;
+ j = i = 0;
for (tran_set = dfa_from->set; tran_set; tran_set = tran_set->next)
if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char)
*pos_i++ = tran_set->value;
- else if (c < 0 && (i == 0 || c > i))
- i = c;
+ else if (c < 0)
+ {
+ if (i == 0 || c > i)
+ i = c;
+ c = posar[tran_set->value]->u.ch[1];
+ if (j == 0 || c > j)
+ j = c;
+ }
*pos_i = -1;
dfa_from->rule_no = -i;
+ dfa_from->rule_nno = -j;
for (char_1 = 0; char_1 <= max_char; char_1++)
{
char_0 = max_char+1;
for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
- if (posar[i]->u.ch[1] >= char_1)
- if ((c=posar[i]->u.ch[0]) < char_0)
- if (c < char_1)
- char_0 = char_1;
- else
- char_0 = c;
+ if (posar[i]->u.ch[1] >= char_1
+ && (c=posar[i]->u.ch[0]) < char_0)
+ if (c < char_1)
+ char_0 = char_1;
+ else
+ char_0 = c;
- char_1 = max_char;
if (char_0 > max_char)
break;
+
+ char_1 = max_char;
+
tran_set = mk_Set (poset);
for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
- if ((c=posar[i]->u.ch[1]) >= char_0)
- if (posar[i]->u.ch[0] <= char_0)
- {
- if (c < char_1)
- char_1 = c;
- tran_set = union_Set (poset, tran_set, followpos[i]);
- }
- else if (c <= char_1)
- char_1 = c-1;
+ {
+ if ((c=posar[i]->u.ch[0]) > char_0 && c <= char_1)
+ char_1 = c - 1; /* forward chunk */
+ else if ((c=posar[i]->u.ch[1]) >= char_0 && c < char_1)
+ char_1 = c; /* backward chunk */
+ if (posar[i]->u.ch[1] >= char_0 && posar[i]->u.ch[0] <= char_0)
+ tran_set = union_Set (poset, tran_set, followpos[i]);
+ }
if (tran_set)
{
add_DFA_state (dfas, &tran_set, &dfa_to);
printf ("DFA state %d", no);
if (s->rule_no)
printf ("#%d", s->rule_no);
+ if (s->rule_nno)
+ printf (" #%d", s->rule_nno);
+
putchar (':');
pr_Set (poset, s->set);
prev_no = -1;
putchar ('\n');
}
+void dfa_parse_cmap_clean (struct DFA *d)
+{
+ struct DFA_parse *dfa = d->parse_info;
+
+ assert (dfa);
+ if (!dfa->charMap)
+ {
+ dfa->charMapSize = 7;
+ dfa->charMap = imalloc (dfa->charMapSize * sizeof(*dfa->charMap));
+ }
+ dfa->charMap[0] = 0;
+}
+
+void dfa_parse_cmap_new (struct DFA *d, const int *cmap)
+{
+ struct DFA_parse *dfa = d->parse_info;
+ const int *cp;
+ int size;
+
+ assert (dfa);
+ for (cp = cmap; *cp; cp += 2)
+ ;
+ size = cp - cmap + 1;
+ if (size > dfa->charMapSize)
+ {
+ if (dfa->charMap)
+ ifree (dfa->charMap);
+ dfa->charMapSize = size;
+ dfa->charMap = imalloc (size * sizeof(*dfa->charMap));
+ }
+ memcpy (dfa->charMap, cmap, size * sizeof(*dfa->charMap));
+}
+
+void dfa_parse_cmap_del (struct DFA *d, int from)
+{
+ struct DFA_parse *dfa = d->parse_info;
+ int *cc;
+
+ assert (dfa);
+ for (cc = dfa->charMap; *cc; cc += 2)
+ if (*cc == from)
+ {
+ while ((cc[0] = cc[2]))
+ {
+ cc[1] = cc[3];
+ cc += 2;
+ }
+ break;
+ }
+}
+
+void dfa_parse_cmap_add (struct DFA *d, int from, int to)
+{
+ struct DFA_parse *dfa = d->parse_info;
+ int *cc;
+ int indx, size;
+
+ assert (dfa);
+ for (cc = dfa->charMap; *cc; cc += 2)
+ if (*cc == from)
+ {
+ cc[1] = to;
+ return ;
+ }
+ indx = cc - dfa->charMap;
+ size = dfa->charMapSize;
+ if (indx >= size)
+ {
+ int *cn = imalloc ((size+16) * sizeof(*dfa->charMap));
+ memcpy (cn, dfa->charMap, indx*sizeof(*dfa->charMap));
+ ifree (dfa->charMap);
+ dfa->charMap = cn;
+ dfa->charMapSize = size+16;
+ }
+ dfa->charMap[indx] = from;
+ dfa->charMap[indx+1] = to;
+ dfa->charMap[indx+2] = 0;
+}
+
+void dfa_parse_cmap_thompson (struct DFA *d)
+{
+ static int thompson_chars[] =
+ {
+ '*', L_CLOS0,
+ '+', L_CLOS1,
+ '|', L_ALT,
+ '^', L_START,
+ '$', L_END,
+ '?', L_QUEST,
+ '.', L_ANY,
+ '(', L_LP,
+ ')', L_RP,
+ ' ', 0,
+ '\t', 0,
+ '\n', 0,
+ 0
+ };
+ dfa_parse_cmap_new (d, thompson_chars);
+}
+
static struct DFA_parse *dfa_parse_init (void)
{
parse_info = (struct DFA_parse *) imalloc (sizeof (struct DFA_parse));
add_BSet (parse_info->charset, parse_info->anyset, '\n');
com_BSet (parse_info->charset, parse_info->anyset);
parse_info->use_Tnode = parse_info->max_Tnode = 0;
+ parse_info->charMap = NULL;
+ parse_info->charMapSize = 0;
return parse_info;
}
assert (parse_info);
term_Tnode();
rm_BSetHandle (&parse_info->charset);
+ ifree (parse_info->charMap);
ifree (parse_info);
*dfap = NULL;
}
poset = mk_SetType (poset_chunk);
init_pos();
init_followpos();
+ assert (parse_info->root);
dfa_trav (parse_info->root);
if (debug_dfa_followpos)
dfa->parse_info = dfa_parse_init ();
dfa->state_info = NULL;
dfa->states = NULL;
+ dfa_parse_cmap_thompson (dfa);
return dfa;
}
-int dfa_parse (struct DFA *dfa, char **pattern)
+int dfa_parse (struct DFA *dfa, const char **pattern)
{
struct Tnode *top;
assert (dfa);
- do_parse (dfa->parse_info, pattern, dfa_thompson_chars, &top);
+ assert (dfa->parse_info);
+ do_parse (dfa->parse_info, pattern, &top);
if (err_code)
return err_code;
if ( !dfa->parse_info->root )
assert (*dfap);
if ((*dfap)->parse_info)
rm_dfa_parse (&(*dfap)->parse_info);
- rm_DFA_states (&(*dfap)->state_info);
+ if ((*dfap)->state_info)
+ rm_DFA_states (&(*dfap)->state_info);
ifree (*dfap);
*dfap = NULL;
}