2 * Copyright (C) 1994, Index Data I/S
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.8 1995-12-06 09:09:58 adam
8 * Work on left and right anchors.
10 * Revision 1.7 1995/11/27 09:23:02 adam
11 * New berbatim hook in regular expressions. "[]n ..".
13 * Revision 1.6 1995/10/16 09:31:25 adam
16 * Revision 1.5 1995/10/02 15:17:58 adam
17 * Bug fix in dfa_delete.
19 * Revision 1.4 1995/09/28 09:18:52 adam
20 * Removed various preprocessor defines.
22 * Revision 1.3 1995/09/04 12:33:26 adam
23 * Various cleanup. YAZ util used instead.
25 * Revision 1.2 1995/01/25 11:30:50 adam
26 * Simple error reporting when parsing regular expressions.
27 * Memory usage reduced.
29 * Revision 1.1 1995/01/24 16:02:52 adam
30 * New private header file in dfa module (dfap.h).
31 * Module no longer uses yacc to parse regular expressions.
53 struct Tnode *p[2]; /* CAT,OR,STAR,PLUS (left,right) */
54 short ch[2]; /* if ch[0] >= 0 then this Tnode represents */
55 /* a character in range [ch[0]..ch[1]] */
56 /* otherwise ch[0] represents */
57 /* accepting no (-acceptno) */
59 unsigned pos : 15; /* CAT/OR/STAR/EPSILON or non-neg. position */
60 unsigned nullable : 1;
61 Set firstpos; /* first positions */
62 Set lastpos; /* last positions */
65 struct Tblock { /* Tnode bucket (block) */
66 struct Tblock *next; /* pointer to next bucket */
67 struct Tnode *tarray; /* array of nodes in bucket */
71 #define STATE_HASH 199
72 #define POSET_CHUNK 100
74 int debug_dfa_trav = 0;
75 int debug_dfa_tran = 0;
76 int debug_dfa_followpos = 0;
79 static struct DFA_parse *parse_info = NULL;
82 static int inside_string;
83 static const unsigned char *expr_ptr;
84 static int expr_verbatim;
85 static unsigned short *ctrl_chars;
86 static struct Tnode **posar;
89 static Set *followpos;
91 static struct Tnode *mk_Tnode (void);
92 static struct Tnode *mk_Tnode_cset (BSet charset);
93 static void term_Tnode (void);
99 mk_dfa_tran (struct DFA_states *dfas),
100 add_follow (Set lastpos, Set firstpos),
101 dfa_trav (struct Tnode *n),
102 init_followpos (void),
103 pr_tran (struct DFA_states *dfas),
104 pr_verbose (struct DFA_states *dfas),
114 *str_char (unsigned c);
130 static int lookahead;
131 static unsigned look_ch;
132 static BSet look_chars;
134 static struct Tnode *expr_1 (void),
139 static struct Tnode *expr_1 (void)
141 struct Tnode *t1, *t2, *tn;
143 if (!(t1 = expr_2 ()))
145 while (lookahead == L_ALT)
148 if (!(t2 = expr_2 ()))
160 static struct Tnode *expr_2 (void)
162 struct Tnode *t1, *t2, *tn;
164 if (!(t1 = expr_3 ()))
166 while (lookahead == L_WILD ||
167 lookahead == L_ANYZ ||
168 lookahead == L_ANY ||
170 lookahead == L_CHAR ||
171 lookahead == L_CHARS)
173 if (!(t2 = expr_3 ()))
185 static struct Tnode *expr_3 (void)
187 struct Tnode *t1, *tn;
189 if (!(t1 = expr_4 ()))
191 if (lookahead == L_CLOS0)
199 else if (lookahead == L_CLOS1)
207 else if (lookahead == L_QUEST)
213 tn->u.p[1] = mk_Tnode();
214 tn->u.p[1]->pos = EPSILON;
220 static struct Tnode *expr_4 (void)
228 if (!(t1 = expr_1 ()))
230 if (lookahead == L_RP)
237 t1->pos = ++parse_info->position;
238 t1->u.ch[1] = t1->u.ch[0] = look_ch;
242 t1 = mk_Tnode_cset (look_chars);
246 t1 = mk_Tnode_cset (parse_info->anyset);
252 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
253 t1->u.p[1] = mk_Tnode();
254 t1->u.p[1]->pos = EPSILON;
260 t1->u.p[0] = mk_Tnode_cset (parse_info->anyset);
268 static void do_parse (struct DFA_parse *dfap, char **s,
269 const unsigned short *cc, struct Tnode **tnp)
273 struct Tnode *t1, *t2, *tn;
275 for (i=0; cc[i]; i +=2)
277 ctrl_chars = imalloc ((i+2) * sizeof(*ctrl_chars));
278 for (i=0; (ctrl_chars[i] = cc[i]); i ++)
282 ctrl_chars[i] = L_LP;
285 ctrl_chars[i] = L_RP;
288 ctrl_chars[i] = L_ANY;
291 ctrl_chars[i] = L_ALT;
294 ctrl_chars[i] = L_QUEST;
297 ctrl_chars[i] = L_CLOS1;
300 ctrl_chars[i] = L_CLOS0;
303 ctrl_chars[i] = L_END;
306 ctrl_chars[i] = L_START;
309 ctrl_chars[i] = L_ANY;
312 ctrl_chars[i] = L_ANYZ;
315 ctrl_chars[i] = L_WILD;
322 expr_ptr = (unsigned char *) *s;
327 if (lookahead == L_START)
330 t2->pos = ++parse_info->position;
331 t2->u.ch[1] = t2->u.ch[0] = '\n';
344 if (lookahead == L_END && t1)
347 t2->pos = ++parse_info->position;
348 t2->u.ch[1] = t2->u.ch[0] = '\n';
359 if (lookahead == 0 && t1)
362 t2->pos = ++parse_info->position;
363 t2->u.ch[0] = -(++parse_info->rule);
364 t2->u.ch[1] = anchor_flag;
375 if (lookahead == L_RP)
376 err_code = DFA_ERR_RP;
377 else if (lookahead == L_LP)
378 err_code = DFA_ERR_LP;
380 err_code = DFA_ERR_SYNTAX;
383 *s = (char *) expr_ptr;
387 static int nextchar (int *esc)
390 if (*expr_ptr == '\0' || isspace(*expr_ptr))
392 else if (*expr_ptr != '\\' || expr_verbatim)
394 if (*expr_ptr == '[' && expr_ptr[1] == ']' && !expr_verbatim)
398 while (expr_ptr[i] >= '0' && expr_ptr[i] <= '9')
399 val = val*10 + expr_ptr[i++]-'0';
402 if (expr_ptr[i] == ' ')
410 assert (expr_verbatim > 0);
442 static int nextchar_set (int *esc)
444 if (*expr_ptr == ' ')
449 return nextchar (esc);
452 static int read_charset (void)
454 int i, ch0, ch1, esc0, esc1, cc = 0;
455 look_chars = mk_BSet (&parse_info->charset);
456 res_BSet (parse_info->charset, look_chars);
458 ch0 = nextchar_set (&esc0);
459 if (!esc0 && ch0 == '^')
462 ch0 = nextchar_set (&esc0);
466 if (!esc0 && ch0 == ']')
468 add_BSet (parse_info->charset, look_chars, ch0);
469 ch1 = nextchar_set (&esc1);
470 if (!esc1 && ch1 == '-')
472 if ((ch1 = nextchar_set (&esc1)) == 0)
474 if (!esc1 && ch1 == ']')
476 add_BSet (parse_info->charset, look_chars, '-');
479 for (i=ch0; ++i<=ch1;)
480 add_BSet (parse_info->charset, look_chars, i);
481 ch0 = nextchar_set (&esc0);
490 com_BSet (parse_info->charset, look_chars);
494 unsigned short dfa_thompson_chars[] =
508 unsigned short dfa_ccl_chars[] =
516 static int lex_sub(void)
519 const unsigned short *cc;
520 while ((look_ch = nextchar (&esc)) != 0)
525 inside_string = !inside_string;
527 else if (esc || inside_string)
529 else if (look_ch == '[')
530 return read_charset();
531 else if (look_ch == ' ')
535 for (cc = ctrl_chars; *cc; cc += 2)
543 static void lex (void)
545 lookahead = lex_sub ();
548 static const char *str_char (unsigned c)
565 sprintf (s+1, "x%02x", c);
587 static void out_char (int c)
589 printf ("%s", str_char (c));
592 static void term_Tnode (void)
594 struct Tblock *t, *t1;
595 for (t = parse_info->start; (t1 = t);)
603 static struct Tnode *mk_Tnode (void)
607 if (parse_info->use_Tnode == parse_info->max_Tnode)
609 tnew = (struct Tblock *) imalloc (sizeof(struct Tblock));
610 tnew->tarray = (struct Tnode *) imalloc (TADD*sizeof(struct Tnode));
613 if (parse_info->use_Tnode == 0)
614 parse_info->start = tnew;
616 parse_info->end->next = tnew;
617 parse_info->end = tnew;
619 parse_info->max_Tnode += TADD;
621 return parse_info->end->tarray+(parse_info->use_Tnode++ % TADD);
624 struct Tnode *mk_Tnode_cset (BSet charset)
626 struct Tnode *tn1, *tn0 = mk_Tnode();
627 int ch1, ch0 = trav_BSet (parse_info->charset, charset, 0);
633 tn0->pos = ++parse_info->position;
634 ch1 = travi_BSet (parse_info->charset, charset, ch0+1) ;
636 tn0->u.ch[1] = parse_info->charset->size;
639 tn0->u.ch[1] = ch1-1;
640 while ((ch0=trav_BSet (parse_info->charset, charset, ch1)) != -1)
646 tn1 = tn0->u.p[1] = mk_Tnode();
648 tn1->pos = ++(parse_info->position);
649 if ((ch1 = travi_BSet (parse_info->charset, charset,
652 tn1->u.ch[1] = parse_info->charset->size;
655 tn1->u.ch[1] = ch1-1;
662 static void del_followpos (void)
667 static void init_pos (void)
669 posar = (struct Tnode **) imalloc (sizeof(struct Tnode*)
670 * (1+parse_info->position));
673 static void del_pos (void)
678 static void add_follow (Set lastpos, Set firstpos)
682 followpos[lastpos->value] =
683 union_Set (poset, followpos[lastpos->value], firstpos);
684 lastpos = lastpos->next;
688 static void dfa_trav (struct Tnode *n)
693 dfa_trav (n->u.p[0]);
694 dfa_trav (n->u.p[1]);
695 n->nullable = n->u.p[0]->nullable & n->u.p[1]->nullable;
696 n->firstpos = mk_Set (poset);
697 n->firstpos = union_Set (poset, n->firstpos, n->u.p[0]->firstpos);
698 if (n->u.p[0]->nullable)
699 n->firstpos = union_Set (poset, n->firstpos, n->u.p[1]->firstpos);
700 n->lastpos = mk_Set (poset);
701 n->lastpos = union_Set (poset, n->lastpos, n->u.p[1]->lastpos);
702 if (n->u.p[1]->nullable)
703 n->lastpos = union_Set (poset, n->lastpos, n->u.p[0]->lastpos);
705 add_follow (n->u.p[0]->lastpos, n->u.p[1]->firstpos);
707 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
708 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
709 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
710 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
715 dfa_trav (n->u.p[0]);
716 dfa_trav (n->u.p[1]);
717 n->nullable = n->u.p[0]->nullable | n->u.p[1]->nullable;
719 n->firstpos = merge_Set (poset, n->u.p[0]->firstpos,
720 n->u.p[1]->firstpos);
721 n->lastpos = merge_Set (poset, n->u.p[0]->lastpos,
723 n->u.p[0]->firstpos = rm_Set (poset, n->u.p[0]->firstpos);
724 n->u.p[0]->lastpos = rm_Set (poset, n->u.p[0]->lastpos);
725 n->u.p[1]->firstpos = rm_Set (poset, n->u.p[1]->firstpos);
726 n->u.p[1]->lastpos = rm_Set (poset, n->u.p[1]->lastpos);
731 dfa_trav (n->u.p[0]);
732 n->nullable = n->u.p[0]->nullable;
733 n->firstpos = n->u.p[0]->firstpos;
734 n->lastpos = n->u.p[0]->lastpos;
735 add_follow (n->lastpos, n->firstpos);
740 dfa_trav (n->u.p[0]);
742 n->firstpos = n->u.p[0]->firstpos;
743 n->lastpos = n->u.p[0]->lastpos;
744 add_follow (n->lastpos, n->firstpos);
750 n->lastpos = mk_Set (poset);
751 n->firstpos = mk_Set (poset);
758 n->firstpos = mk_Set (poset);
759 n->firstpos = add_Set (poset, n->firstpos, n->pos);
760 n->lastpos = mk_Set (poset);
761 n->lastpos = add_Set (poset, n->lastpos, n->pos);
764 printf ("#%d", -n->u.ch[0]);
765 else if (n->u.ch[1] > n->u.ch[0])
768 out_char (n->u.ch[0]);
769 if (n->u.ch[1] > n->u.ch[0]+1)
771 out_char (n->u.ch[1]);
775 out_char (n->u.ch[0]);
779 printf ("\n nullable : %c\n", n->nullable ? '1' : '0');
780 printf (" firstpos :");
781 pr_Set (poset, n->firstpos);
782 printf (" lastpos :");
783 pr_Set (poset, n->lastpos);
787 static void init_followpos (void)
791 followpos = fa = (Set *) imalloc (sizeof(Set) * (1+parse_info->position));
792 for (i = parse_info->position+1; --i >= 0; fa++)
793 *fa = mk_Set (poset);
796 static void mk_dfa_tran (struct DFA_states *dfas)
803 struct DFA_state *dfa_from, *dfa_to;
806 max_char = parse_info->charset->size;
807 pos = (short *) imalloc (sizeof(*pos) * (parse_info->position+1));
809 tran_set = cp_Set (poset, parse_info->root->firstpos);
810 i = add_DFA_state (dfas, &tran_set, &dfa_from);
812 dfa_from->rule_no = 0;
813 while ((dfa_from = get_DFA_state (dfas)))
817 for (tran_set = dfa_from->set; tran_set; tran_set = tran_set->next)
818 if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char)
819 *pos_i++ = tran_set->value;
820 else if (c < 0 && (i == 0 || c > i))
823 dfa_from->rule_no = -i;
825 for (char_1 = 0; char_1 <= max_char; char_1++)
828 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
829 if (posar[i]->u.ch[1] >= char_1
830 && (c=posar[i]->u.ch[0]) < char_0)
836 if (char_0 > max_char)
841 tran_set = mk_Set (poset);
842 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
844 if ((c=posar[i]->u.ch[0]) > char_0 && c <= char_1)
845 char_1 = c - 1; /* forward chunk */
846 else if ((c=posar[i]->u.ch[1]) >= char_0 && c < char_1)
847 char_1 = c; /* backward chunk */
848 if (posar[i]->u.ch[1] >= char_0 && posar[i]->u.ch[0] <= char_0)
849 tran_set = union_Set (poset, tran_set, followpos[i]);
853 add_DFA_state (dfas, &tran_set, &dfa_to);
854 add_DFA_tran (dfas, dfa_from, char_0, char_1, dfa_to->no);
859 sort_DFA_states (dfas);
862 static void pr_tran (struct DFA_states *dfas)
865 struct DFA_tran *tran;
866 int prev_no, i, c, no;
868 for (no=0; no < dfas->no; ++no)
870 s = dfas->sortarray[no];
871 assert (s->no == no);
872 printf ("DFA state %d", no);
874 printf ("#%d", s->rule_no);
876 pr_Set (poset, s->set);
878 for (i=s->tran_no, tran=s->trans; --i >= 0; tran++)
880 if (prev_no != tran->to)
884 printf (" goto %d on [", tran->to);
887 for (c = tran->ch[0]; c <= tran->ch[1]; c++)
888 printf ("%s", str_char(c));
896 static void pr_verbose (struct DFA_states *dfas)
900 printf ("%d/%d tree nodes used, %d bytes each\n",
901 parse_info->use_Tnode, parse_info->max_Tnode, sizeof (struct Tnode));
902 k = inf_BSetHandle (parse_info->charset, &i, &j);
903 printf ("%ld/%ld character sets, %d bytes each\n",
904 i/k, j/k, k*sizeof(BSetWord));
905 k = inf_SetType (poset, &i, &j);
906 printf ("%ld/%ld poset items, %d bytes each\n", i, j, k);
907 printf ("%d DFA states\n", dfas->no);
910 static void pr_followpos (void)
913 printf ("\nfollowsets:\n");
914 for (i=1; i <= parse_info->position; i++)
917 pr_Set (poset, followpos[i]);
920 if (posar[i]->u.ch[0] < 0)
921 printf ("#%d", -posar[i]->u.ch[0]);
922 else if (posar[i]->u.ch[1] > posar[i]->u.ch[0])
925 out_char (posar[i]->u.ch[0]);
926 if (posar[i]->u.ch[1] > posar[i]->u.ch[0]+1)
928 out_char (posar[i]->u.ch[1]);
932 out_char (posar[i]->u.ch[0]);
938 static struct DFA_parse *dfa_parse_init (void)
940 parse_info = (struct DFA_parse *) imalloc (sizeof (struct DFA_parse));
942 parse_info->charset = mk_BSetHandle (255, 20);
943 parse_info->position = 0;
944 parse_info->rule = 0;
945 parse_info->root = NULL;
947 parse_info->anyset = mk_BSet (&parse_info->charset);
948 res_BSet (parse_info->charset, parse_info->anyset);
949 add_BSet (parse_info->charset, parse_info->anyset, '\n');
950 com_BSet (parse_info->charset, parse_info->anyset);
951 parse_info->use_Tnode = parse_info->max_Tnode = 0;
955 static void rm_dfa_parse (struct DFA_parse **dfap)
960 rm_BSetHandle (&parse_info->charset);
965 static struct DFA_states *mk_dfas (struct DFA_parse *dfap, int poset_chunk)
967 struct DFA_states *dfas;
969 assert (poset_chunk > 10);
973 poset = mk_SetType (poset_chunk);
976 dfa_trav (parse_info->root);
978 if (debug_dfa_followpos)
980 init_DFA_states (&dfas, poset, STATE_HASH);
992 struct DFA *dfa_init (void)
996 dfa = imalloc (sizeof(*dfa));
997 dfa->parse_info = dfa_parse_init ();
998 dfa->state_info = NULL;
1003 int dfa_parse (struct DFA *dfa, char **pattern)
1008 do_parse (dfa->parse_info, pattern, dfa_thompson_chars, &top);
1011 if ( !dfa->parse_info->root )
1012 dfa->parse_info->root = top;
1019 n->u.p[0] = dfa->parse_info->root;
1021 dfa->parse_info->root = n;
1026 void dfa_mkstate (struct DFA *dfa)
1030 dfa->state_info = mk_dfas (dfa->parse_info, POSET_CHUNK);
1031 dfa->no_states = dfa->state_info->no;
1032 dfa->states = dfa->state_info->sortarray;
1033 rm_dfa_parse (&dfa->parse_info);
1036 void dfa_delete (struct DFA **dfap)
1040 if ((*dfap)->parse_info)
1041 rm_dfa_parse (&(*dfap)->parse_info);
1042 if ((*dfap)->state_info)
1043 rm_DFA_states (&(*dfap)->state_info);