/* Copyright (C) 2006, Index Data ApS
* See the file LICENSE for details.
- * $Id: nfa.h,v 1.2 2006-05-03 11:09:59 heikki Exp $
+ * $Id: nfa.h,v 1.3 2006-05-03 13:47:35 heikki Exp $
*/
/**
yaz_nfa *n /** The NFA itself */,
yaz_nfa_state *s /** The state whose result you want */);
-/** \brief Set the backref number to a state.
+/** \brief Set a backref point to a state.
*
- * Each state can be the beginning and/or ending of a backref
- * sequence. This call sets those flags in the states. After matching,
- * we can get hold of the backrefs that matched, and use them in our
- * translations. The backrefs start at 1, not zero!
+ * Each state can be the beginning and/or ending point of a backref
+ * sequence. This call sets one of those flags in the state. After
+ * matching, we can get hold of the backrefs that matched, and use
+ * them in our translations. The numbering of backrefs start at 1,
+ * not zero!
*
* \param n the nfa
* \param s the state to add to
*
*/
-int yaz_nfa_set_backref(yaz_nfa *n, yaz_nfa_state *s,
+int yaz_nfa_set_backref_point(yaz_nfa *n, yaz_nfa_state *s,
int backref_number,
int is_start );
-/** \brief Get the backref number of a state.
+/** \brief Get the backref point of a state
*
* \param n the nfa
* \param s the state to add to
* \return the backref number associated with the state, or 0 if none.
*/
-int yaz_nfa_get_backref(yaz_nfa *n, yaz_nfa_state *s,
+int yaz_nfa_get_backref_point(yaz_nfa *n, yaz_nfa_state *s,
int is_start );
/** \brief Add a transition to the NFA.
#define YAZ_NFA_OVERRUN 2
#define YAZ_NFA_LOOP 3
+/** \brief Get a back reference after a successfull match.
+ *
+ * \param n the nfa
+ * \param backref_no the number of the backref to get
+ * \param begin beginning of the matching substring
+ * \param end end of the matching substring
+ *
+ * Returns pointers to the beginning and end of a backref, or null
+ * pointers if one endpoint not met. Those pointers point to the
+ * original buffer that was matched, so the caller will not have to
+ * worry about freeing anything special.
+ *
+ * It is technically possible to create NFAs that meet the start but
+ * not the end of a backref. It is up to the caller to decide how
+ * to handle such a situation.
+ *
+ * \retval 0 OK
+ * \retval 1 no match
+ * \retval 2 no such backref
+ */
+
+int yaz_nfa_get_backref( yaz_nfa *n,
+ int backref_no,
+ yaz_nfa_char **start,
+ yaz_nfa_char **end );
/** \brief Get the first state of the NFA.
*
/* Copyright (C) 2006, Index Data ApS
* See the file LICENSE for details.
*
- * $Id: nfa.c,v 1.3 2006-05-03 11:36:14 mike Exp $
+ * $Id: nfa.c,v 1.4 2006-05-03 13:47:35 heikki Exp $
*
* This is a simple NFA-based system for character set normalizing
* in yaz and zebra. Unlike standard NFA's, this operates on ranges of
yaz_nfa_state *firststate; /* the initial state */
yaz_backref_info *curr_backrefs; /* the backrefs we are matching */
yaz_backref_info *best_backrefs; /* backrefs of the best match so far*/
+ int lastmatch; /* the result of last match */
};
struct yaz_nfa_state {
n->nbackrefs=0;
n->curr_backrefs=0;
n->best_backrefs=0;
+ n->lastmatch= YAZ_NFA_NOMATCH;
return n;
}
return s->result;
}
-int yaz_nfa_set_backref(yaz_nfa *n, yaz_nfa_state *s,
+int yaz_nfa_set_backref_point(yaz_nfa *n, yaz_nfa_state *s,
int backref_number,
int is_start ){
if (is_start) {
if (s->backref_start)
return 1;
s->backref_start=backref_number;
- if (n->nbackrefs<backref_number) {
- n->nbackrefs=backref_number;
+ if (n->nbackrefs<=backref_number) {
+ n->nbackrefs=backref_number+1;
n->curr_backrefs=0;
n->best_backrefs=0;
/* clear them just in case we have already matched on */
return 0; /* ok */
}
-int yaz_nfa_get_backref(yaz_nfa *n, yaz_nfa_state *s,
+int yaz_nfa_get_backref_point(yaz_nfa *n, yaz_nfa_state *s,
int is_start ) {
if (!s)
return 0;
t=t->next;
if ( (( t->range_start <= *inchar ) && ( t->range_end >= *inchar )) ){
m->empties=0;
+ if (t->range_start!=t->range_end){
+ /* backref 0 is special: the last range operation */
+ m->n->curr_backrefs[0].start=inchar;
+ m->n->curr_backrefs[0].end=inchar;
+ }
match_state(t->to_state, inchar+1,incharsleft-1,m);
/* yes, descent to all matching nodes, even if overrun, */
/* since we can find a better one later */
m->bestnode=s->num;
m->result=s->result;
if (m->n->curr_backrefs)
- for (i=0; i<m->n->nbackrefs;i++)
+ for (i=0; i<m->n->nbackrefs;i++) {
m->n->best_backrefs[i]=m->n->curr_backrefs[i];
+ }
}
}
if (s->backref_start)
m->n->curr_backrefs[s->backref_start].start=0;
if (s->backref_end)
m->n->curr_backrefs[s->backref_end].end=0;
+ m->n->curr_backrefs[0].start=0;
+ m->n->curr_backrefs[0].end=0;
} /* match_state */
int yaz_nfa_match(yaz_nfa *n,
size_t incharsleft,
void **result ){
struct matcher m;
- if (!n->firststate)
- return YAZ_NFA_NOMATCH;
+ int sz;
+ int i;
+ if (!n->firststate) {
+ n->lastmatch=YAZ_NFA_NOMATCH;
+ return n->lastmatch;
+ }
m.n=n;
m.longest=*inbuff;
m.bestnode=n->nstates;
m.result=0;
m.errorcode=0;
- if ((!n->curr_backrefs) && (n->nbackrefs)){
- int sz=sizeof( struct yaz_nfa_backref_info) * (n->nbackrefs+1);
+ sz=sizeof( struct yaz_nfa_backref_info) * n->nbackrefs;
+ if (!n->curr_backrefs) {
n->curr_backrefs=nmem_malloc( n->nmem, sz);
n->best_backrefs=nmem_malloc( n->nmem, sz);
}
-
+ for (i=0;i<n->nbackrefs;i++) {
+ n->curr_backrefs[i].start=0;
+ n->curr_backrefs[i].end=0;
+ n->best_backrefs[i].start=0;
+ n->best_backrefs[i].end=0;
+ }
match_state(n->firststate, *inbuff, incharsleft, &m);
if (m.result) {
*result=m.result;
*inbuff=m.longest;
if (m.errorcode)
- return m.errorcode;
+ n->lastmatch=m.errorcode;
else
- return YAZ_NFA_SUCCESS;
+ n->lastmatch= YAZ_NFA_SUCCESS;
+ return n->lastmatch;
}
- return YAZ_NFA_NOMATCH;
+ n->lastmatch=YAZ_NFA_NOMATCH;
+ return n->lastmatch;
}
+int yaz_nfa_get_backref( yaz_nfa *n,
+ int backref_no,
+ yaz_nfa_char **start,
+ yaz_nfa_char **end) {
+ if (backref_no>=n->nbackrefs)
+ return 2;
+ if (backref_no<0)
+ return 2;
+ if (n->lastmatch== YAZ_NFA_NOMATCH)
+ return 1; /* accept other errors, they return partial matches*/
+
+ *start=n->best_backrefs[backref_no].start;
+ *end=n->best_backrefs[backref_no].end;
+ return 0;
+}
/* * * * * * * * *
yaz_nfa_state *s;
if (!F) /* lazy programmers can just pass 0 for F */
F=stdout;
- fprintf(F,"The NFA has %d states \n",n->nstates);
+ fprintf(F,"The NFA has %d states and %d backrefs\n",
+ n->nstates, n->nbackrefs);
s=n->laststate;
if (s) {
do {
/* Copyright (C) 2006, Index Data ApS
* See the file LICENSE for details.
*
- * $Id: nfatest1.c,v 1.1 2006-05-03 09:04:33 heikki Exp $
+ * $Id: nfatest1.c,v 1.2 2006-05-03 13:47:35 heikki Exp $
*
*/
yaz_nfa_char *buf, int buflen,
int expcode, char *expstr) {
yaz_nfa_char *c=buf;
+ yaz_nfa_char *cp1,*cp2;
void *resptr=0;
- int i;
+ int i,bi;
i=yaz_nfa_match(n,&c,buflen,&resptr);
#if VERBOSE
printf("\n'%s' returned %d. Moved c by %d, and resulted in '%s'\n",
expstr, i, (c-buf),(char*)resptr);
#endif
YAZ_CHECK_EQ(i,expcode);
- YAZ_CHECK_EQ(strcmp(expstr,(char*)resptr),0);
+ if (i!=1)
+ YAZ_CHECK_EQ(strcmp(expstr,(char*)resptr),0);
+ i=0;
+ bi=0;
+ while(bi!=2){
+ bi=yaz_nfa_get_backref(n,i,&cp1,&cp2);
+ if (bi==0 && ( cp1 || cp2 ) ) {
+#if VERBOSE
+ printf(" got backref %d of %d chars (%p to %p): '",
+ i, cp2-cp1+1, cp1, cp2);
+ while (cp2-cp1 >= 0 )
+ printf("%c", *cp1++);
+ printf("'\n");
+#endif
+ }
+ i++;
+ }
}
void construction_test() {
yaz_nfa* n= yaz_nfa_init();
+ yaz_nfa_char *cp, *cp1, *cp2;
yaz_nfa_state *s,*s0,*s1,*s2,*s3,*s4,*s5;
int i;
yaz_nfa_char seq1[]={'p','r','e','f','i','x',0};
yaz_nfa_char seq2[]={'p','r','e','l','i','m',0};
- yaz_nfa_char tst1[]={'c','0'};
- yaz_nfa_char tst2[]={'c','k','0'};
- yaz_nfa_char tst3[]={'c','x','0'};
- yaz_nfa_char tst4[]={'z','k','0'};
- yaz_nfa_char tst5[]={'y','k','k','k','k','k','k','y','0'};
+ yaz_nfa_char tst1[]={'c',0};
+ yaz_nfa_char tst2[]={'c','k',0};
+ yaz_nfa_char tst3[]={'c','x',0};
+ yaz_nfa_char tst4[]={'z','k',0};
+ yaz_nfa_char tst5[]={'y','k','l','k','k','l','k','d',0};
+ yaz_nfa_char tst6[]={'x','z','k','a','b',0};
void *p;
YAZ_CHECK(n);
yaz_nfa_set_result(n,s,"loop");
s=yaz_nfa_add_range(n, 0, 'y','y' );
+ yaz_nfa_set_backref_point(n,s,1,1);
s1=yaz_nfa_add_state(n);
- yaz_nfa_set_backref(n,s1,1,1);
yaz_nfa_add_empty_transition(n,s,s1);
s=s1;
- yaz_nfa_add_transition(n,s,s,'k','k');
- s=yaz_nfa_add_range(n, s, 'y','y' );
- yaz_nfa_set_result(n,s,"y k+ y");
- yaz_nfa_set_backref(n,s,1,0);
+ yaz_nfa_add_transition(n,s,s,'k','l');
+ s=yaz_nfa_add_range(n, s, 'd','d' );
+ yaz_nfa_set_result(n,s,"y k+ d");
+ yaz_nfa_set_backref_point(n,s,1,0);
s=yaz_nfa_add_sequence(n, 0, seq1 );
yaz_nfa_set_result(n,s,"PREFIX");
s=yaz_nfa_add_sequence(n, 0, seq2 );
yaz_nfa_set_result(n,s,"PRELIM");
+ s=yaz_nfa_add_range(n, 0, 'x','x' );
+ yaz_nfa_set_backref_point(n,s,2,1);
+ s1=yaz_nfa_add_sequence(n,s,tst4);
+ yaz_nfa_set_backref_point(n,s1,2,0);
+ yaz_nfa_set_result(n,s1,"xzk");
+
+ /* check return codes before doing any matches */
+ i=yaz_nfa_get_backref(n, 0, &cp1, &cp2 );
+ YAZ_CHECK_EQ(i,1);
+ i=yaz_nfa_get_backref(n, 3, &cp1, &cp2 );
+ YAZ_CHECK_EQ(i,2);
+ i=yaz_nfa_get_backref(n, 1, &cp1, &cp2 );
+ YAZ_CHECK_EQ(i,1);
+
+
#if VERBOSE
yaz_nfa_dump(0,n, printfunc);
#endif
test_match(n,tst2,3,YAZ_NFA_SUCCESS,"first");
test_match(n,tst3,3,YAZ_NFA_SUCCESS,"a-k,x-z");
test_match(n,tst4,9,YAZ_NFA_LOOP,"loop");
- test_match(n,tst5,9,YAZ_NFA_SUCCESS,"y k+ y");
+ test_match(n,tst5,9,YAZ_NFA_SUCCESS,"y k+ d");
+
+ cp=tst6;
+ i=yaz_nfa_match(n,&cp,8,&p);
+ YAZ_CHECK_EQ(i,YAZ_NFA_SUCCESS);
+ i=yaz_nfa_get_backref(n, 2, &cp1, &cp2 );
+ YAZ_CHECK_EQ(i,0);
+#if VERBOSE
+ printf("backref from %p to %p is %d long\n",
+ cp1,cp2, cp2-cp1 );
+#endif
yaz_nfa_destroy(n);
}