/* Copyright (C) 2006, Index Data ApS
* See the file LICENSE for details.
- * $Id: nfa.h,v 1.4 2006-05-04 18:58:24 adam Exp $
+ * $Id: nfa.h,v 1.5 2006-05-05 09:14:42 heikki Exp $
*/
/**
* possible sequence of input characters that match the ranges in the
* conditions, and that leads into a terminal state.
*
+ * Separate from this we have converters. Those can often be used
+ * together with a NFA (think match-pattern and replace-pattern).
+ *
+ * A converter is a routine that produces some output. It can translate a
+ * range of characters into another range, emit a constant string, or
+ * something like that.
+ *
*/
#ifndef YAZ_NFA_H
typedef struct yaz_nfa_transition yaz_nfa_transition;
+/** brief Simple character range converter */
+typedef struct yaz_nfa_converter yaz_nfa_converter;
+
+
+
/** \brief Initialize the NFA without any states in it
*
* \return a pointer to the newly created NFA
* Add a transition between two existing states. The condition
* is (as always) a range of yaz_nfa_chars.
* \param n the nfa
- * \param from_state which state the transition is from
+ * \param from_state which state the transition is from. null=initial
* \param to_state where the transition goes to
* \param range_start is the beginning of the range of values
* \param range_end is the end of the range of values
*
* \param n the nfa itself
* \param inbuff buffer of input data. Will be incremented when match
- * \param incharsleft max number of inchars to use from inbuff
+ * \param incharsleft max number of inchars to use from inbuff. decrements.
* \param result the result pointer from the nfa (what ever that is)
*
* In case of errors, returns the best match so far,
*
*/
-int yaz_nfa_match(yaz_nfa *n, yaz_nfa_char **inbuff, size_t incharsleft,
+int yaz_nfa_match(yaz_nfa *n, yaz_nfa_char **inbuff, size_t *incharsleft,
void **result );
/** yaz_nfa_match return codes */
yaz_nfa_char **start,
yaz_nfa_char **end );
+/** \brief Create a string converter.
+ * \param n the nfa
+ * \param string the string to output
+ * \param length how many chars in the string
+ *
+ * This converter produces a constant string in the output
+ */
+yaz_nfa_converter *yaz_nfa_create_string_converter (
+ yaz_nfa *n,
+ yaz_nfa_char *string,
+ size_t length );
+
+/** \brief Create a backref converter
+ * \param n the nfa
+ * \param backref_no The backreference to reproduce
+ *
+ * This converter copies a backref into the output buffer
+ */
+yaz_nfa_converter *yaz_nfa_create_backref_converter (
+ yaz_nfa *n, int backref_no );
+
+
+
+/** \brief Connects converters in a chain.
+ * \param n the nfa (mostly for nmem access)
+ * \param startpoint the first converter in the chain
+ * \param newconverter
+ *
+ * Places the new converter at the end of the chain that starts from
+ * startpoint.
+ *
+ */
+void yaz_nfa_append_converter (
+ yaz_nfa *n,
+ yaz_nfa_converter *startpoint,
+ yaz_nfa_converter *newconverter);
+
+/** brief Runs the chain of converters.
+ * \param n the nfa (mostly for nmem access)
+ * \param c the first converter in a chain
+ * \param outbuff buffer to write the output in. Increments the ptr.
+ * \param outcharsleft how many may we write
+ *
+ * Runs the converters in the chain, placing output into outbuff
+ * (and incrementing the pointer).
+ *
+ * \retval 0 OK
+ * \retval 1 no match to get backrefs from
+ * \retval 2 no room in outbuf
+ *
+ */
+int yaz_nfa_run_converters(
+ yaz_nfa *n,
+ yaz_nfa_converter *c,
+ yaz_nfa_char **outbuff,
+ size_t *outcharsleft);
+
+
/** \brief Get the first state of the NFA.
*
* \param n the nfa
void yaz_nfa_dump(FILE *F, yaz_nfa *n, char *(*strfunc)(void *) );
+
+
+
YAZ_END_CDECL
#endif
/* Copyright (C) 2006, Index Data ApS
* See the file LICENSE for details.
*
- * $Id: nfa.c,v 1.5 2006-05-04 18:58:54 adam Exp $
+ * $Id: nfa.c,v 1.6 2006-05-05 09:14:42 heikki Exp $
*/
/**
#define LOOP_LIMIT 100
+typedef enum {
+ conv_none,
+ conv_string,
+ conv_backref
+} yaz_nfa_converter_type;
+
+struct yaz_nfa_converter {
+ yaz_nfa_converter *next;
+ yaz_nfa_converter_type type;
+ yaz_nfa_char *string;
+ size_t strlen;
+ int backref_no;
+ int char_diff;
+};
/* * * * * * *
yaz_nfa_char range_start,
yaz_nfa_char range_end) {
yaz_nfa_transition *t = nmem_malloc(n->nmem, sizeof(yaz_nfa_transition));
+ if (!from_state)
+ from_state = n->firststate;
t->range_start = range_start;
t->range_end = range_end;
t->to_state = to_state;
int yaz_nfa_match(yaz_nfa *n,
yaz_nfa_char **inbuff,
- size_t incharsleft,
+ size_t *incharsleft,
void **result ){
struct matcher m;
int sz;
m.bestnode = n->nstates;
m.result = 0;
m.errorcode = 0;
+ m.empties = 0;
sz = sizeof( struct yaz_nfa_backref_info) * n->nbackrefs;
if (!n->curr_backrefs) {
n->curr_backrefs = nmem_malloc( n->nmem, sz);
n->best_backrefs[i].end = 0;
}
- match_state(n->firststate, *inbuff, incharsleft, &m);
+ match_state(n->firststate, *inbuff, *incharsleft, &m);
if (m.result) {
+ *incharsleft -= (m.longest-*inbuff);
*result = m.result;
*inbuff = m.longest;
if (m.errorcode)
return 0;
}
+/* * * * * * * * * * * * * *
+ * Converters
+ * * * * * * * * * * * * * */
+
+static yaz_nfa_converter *create_null_converter ( yaz_nfa *n) {
+ yaz_nfa_converter *c;
+ c=nmem_malloc(n->nmem, sizeof(yaz_nfa_converter));
+ c->next=0;
+ c->type=conv_none;
+ c->string=0;
+ c->strlen=0;
+ c->backref_no=0;
+ c->char_diff=0;
+ return c;
+}
+
+yaz_nfa_converter *yaz_nfa_create_string_converter (
+ yaz_nfa *n,
+ yaz_nfa_char *string,
+ size_t length){
+ yaz_nfa_converter *c;
+ int i;
+ c=create_null_converter(n);
+ c->type=conv_string;
+ c->string=nmem_malloc(n->nmem, length*sizeof(yaz_nfa_char));
+ for (i=0;i<length;i++)
+ c->string[i]=string[i];
+ c->strlen=length;
+ return c;
+}
+
+yaz_nfa_converter *yaz_nfa_create_backref_converter (
+ yaz_nfa *n, int backref_no ) {
+ yaz_nfa_converter *c;
+ c=create_null_converter(n);
+ c->type=conv_backref;
+ c->backref_no=backref_no;
+ return c;
+}
+
+
+void yaz_nfa_append_converter (
+ yaz_nfa *n,
+ yaz_nfa_converter *startpoint,
+ yaz_nfa_converter *newconverter) {
+ while (startpoint->next)
+ startpoint=startpoint->next;
+ startpoint->next=newconverter;
+}
+
+static int string_convert (
+ yaz_nfa *n,
+ yaz_nfa_converter *c,
+ yaz_nfa_char **outbuff,
+ size_t *outcharsleft){
+ size_t sz=c->strlen;
+ yaz_nfa_char *p=c->string;
+ while (sz--) {
+ if ((*outcharsleft)-- <= 0)
+ return 2;
+ **outbuff=*p++;
+ (*outbuff)++;
+ }
+ return 0;
+}
+static int backref_convert (
+ yaz_nfa *n,
+ yaz_nfa_converter *c,
+ yaz_nfa_char **outbuff,
+ size_t *outcharsleft){
+ yaz_nfa_char *cp1,*cp2;
+ int i;
+ i=yaz_nfa_get_backref(n,c->backref_no, &cp1, &cp2);
+ if (i==2) /* no backref, produce no output, that's ok */
+ return 0;
+ if (i==1) /* no match in dfa */
+ return 1; /* should not happen */
+ while (cp2>cp1) {
+ if ((*outcharsleft)-- <= 0)
+ return 2;
+ **outbuff=*cp1++;
+ (*outbuff)++;
+ }
+ return 0;
+}
+
+
+int yaz_nfa_run_converters(
+ yaz_nfa *n,
+ yaz_nfa_converter *c,
+ yaz_nfa_char **outbuff,
+ size_t *outcharsleft){
+ int rc=0;
+ // yaz_nfa_char *bufstart=*outbuff;
+ while (c && !rc) {
+ switch(c->type) {
+ case conv_string:
+ rc=string_convert(n,c,outbuff,outcharsleft);
+ break;
+ case conv_backref:
+ rc=backref_convert(n,c,outbuff,outcharsleft);
+ break;
+ default:
+ rc=3; /* internal error */
+ }
+ c=c->next;
+ }
+ return rc;
+}
/* * * * * * * * *
* Debug routines
/* Copyright (C) 2006, Index Data ApS
* See the file LICENSE for details.
*
- * $Id: nfatest1.c,v 1.3 2006-05-04 18:59:13 adam Exp $
+ * $Id: nfatest1.c,v 1.4 2006-05-05 09:14:42 heikki Exp $
*
*/
return buf;
}
+char *printfunc2(void *result) {
+ static char buf[200];
+ sprintf(buf,"(%p)", result);
+ return buf;
+}
+
void test_match(yaz_nfa *n,
- yaz_nfa_char *buf, int buflen,
+ yaz_nfa_char *buf, size_t buflen,
int expcode, char *expstr) {
yaz_nfa_char *c = buf;
yaz_nfa_char *cp1, *cp2;
void *resptr = 0;
int i, bi;
- i = yaz_nfa_match(n,&c, buflen,&resptr);
+ size_t buflen2 = buflen;
+ i = yaz_nfa_match(n,&c, &buflen2,&resptr);
#if VERBOSE
printf("\n'%s' returned %d. Moved c by %d, and resulted in '%s'\n",
expstr, i, (c-buf),(char*)resptr);
#endif
+ YAZ_CHECK_EQ(buflen-buflen2, c-buf);
YAZ_CHECK_EQ(i, expcode);
if (i!=1)
YAZ_CHECK_EQ(strcmp(expstr,(char*)resptr), 0);
yaz_nfa_char tst5[]={'y', 'k', 'l', 'k', 'k', 'l', 'k', 'd', 0};
yaz_nfa_char tst6[]={'x', 'z', 'k', 'a', 'b', 0};
void *p;
+ size_t sz;
YAZ_CHECK(n);
test_match(n, tst5, 9, YAZ_NFA_SUCCESS, "y k+ d");
cp = tst6;
- i = yaz_nfa_match(n,&cp, 8,&p);
+ sz = 8;
+ i = yaz_nfa_match(n, &cp, &sz, &p);
YAZ_CHECK_EQ(i, YAZ_NFA_SUCCESS);
i = yaz_nfa_get_backref(n, 2, &cp1, &cp2 );
YAZ_CHECK_EQ(i, 0);
#if VERBOSE
- printf("backref from %p to %p is %d long\n",
- cp1, cp2, cp2-cp1 );
+ printf("backref from %p to %p is %d long. sz is now %d\n",
+ cp1,cp2, cp2-cp1,sz );
#endif
yaz_nfa_destroy(n);
}
+void converter_test() {
+ yaz_nfa* n= yaz_nfa_init();
+ yaz_nfa_converter *c1,*c2;
+ yaz_nfa_char str1[]={'a','b','c'};
+ yaz_nfa_char seq1[]={'A','B','C',0};
+ yaz_nfa_char seq2[]={'k','m','n','m','x','P','Q','X',0};
+ yaz_nfa_char outbuf[1024];
+ yaz_nfa_char *outp,*cp, *cp1, *cp2;
+ yaz_nfa_state *s, *s2;
+ void *vp;
+ int i;
+ size_t sz;
+
+ c1=yaz_nfa_create_string_converter(n,str1,3);
+
+ for(i=0;i<1024;i++)
+ outbuf[i]=10000+i;
+ outp=outbuf;
+ sz=1;
+ i=yaz_nfa_run_converters(n, c1, &outp, &sz);
+ YAZ_CHECK_EQ(i,2); /* overrun */
+ YAZ_CHECK_EQ(outbuf[0],'a');
+ YAZ_CHECK_EQ(outbuf[1],10000+1);
+
+ for(i=0;i<1024;i++)
+ outbuf[i]=10000+i;
+ outp=outbuf;
+ sz=3;
+ i=yaz_nfa_run_converters(n, c1, &outp, &sz);
+ YAZ_CHECK_EQ(i,0);
+ YAZ_CHECK_EQ(outbuf[0],'a');
+ YAZ_CHECK_EQ(outbuf[1],'b');
+ YAZ_CHECK_EQ(outbuf[2],'c');
+ YAZ_CHECK_EQ(outbuf[3],10000+3);
+ YAZ_CHECK_EQ(sz,0);
+
+ c2=yaz_nfa_create_string_converter(n,str1,2);
+ yaz_nfa_append_converter(n,c1,c2);
+
+ for(i=0;i<1024;i++)
+ outbuf[i]=10000+i;
+ outp=outbuf;
+ sz=10;
+ i=yaz_nfa_run_converters(n, c1, &outp, &sz);
+ YAZ_CHECK_EQ(i,0);
+ YAZ_CHECK_EQ(outbuf[0],'a');
+ YAZ_CHECK_EQ(outbuf[1],'b');
+ YAZ_CHECK_EQ(outbuf[2],'c');
+ YAZ_CHECK_EQ(outbuf[3],'a');
+ YAZ_CHECK_EQ(outbuf[4],'b');
+ YAZ_CHECK_EQ(outbuf[5],10000+5);
+ YAZ_CHECK_EQ(sz,5);
+
+ /* ABC -> abcab */
+ (void) yaz_nfa_add_state(n);/* start state */
+ s=yaz_nfa_add_state(n);
+ yaz_nfa_add_empty_transition(n,0,s);
+ yaz_nfa_set_backref_point(n,s,1,1);
+ s=yaz_nfa_add_sequence(n, s, seq1 );
+ yaz_nfa_set_result(n,s,c1);
+ yaz_nfa_set_backref_point(n,s,1,0);
+
+ /* [k-o][m-n]*x -> m-n sequence */
+ s=yaz_nfa_add_state(n);
+ yaz_nfa_add_empty_transition(n,0,s);
+ yaz_nfa_set_backref_point(n,s,2,1);
+ s2=yaz_nfa_add_state(n);
+ yaz_nfa_add_transition(n,s,s2,'k','o');
+ yaz_nfa_add_transition(n,s2,s2,'m','n');
+ s=yaz_nfa_add_state(n);
+ yaz_nfa_add_transition(n,s2,s,'x','x');
+ yaz_nfa_set_backref_point(n,s,2,0);
+
+ c1=yaz_nfa_create_backref_converter(n,2);
+ yaz_nfa_set_result(n,s,c1);
+
+#if VERBOSE
+ yaz_nfa_dump(0,n, printfunc2);
+#endif
+
+ cp=seq2;
+ sz=18;
+ i=yaz_nfa_match(n,&cp,&sz,&vp);
+ c2=vp;
+ YAZ_CHECK_EQ(i,YAZ_NFA_SUCCESS);
+ i=yaz_nfa_get_backref(n, 2, &cp1, &cp2 );
+ YAZ_CHECK_EQ(i,0);
+ YAZ_CHECK_EQ((int)c1,(int)c2);
+ for(i=0;i<1024;i++)
+ outbuf[i]=10000+i;
+ outp=outbuf;
+ sz=11;
+ i=yaz_nfa_run_converters(n, c2, &outp, &sz);
+ YAZ_CHECK_EQ(i,0);
+ YAZ_CHECK_EQ(outbuf[0],'k');
+ YAZ_CHECK_EQ(outbuf[1],'m');
+ YAZ_CHECK_EQ(outbuf[2],'n');
+ YAZ_CHECK_EQ(outbuf[3],'m');
+ YAZ_CHECK_EQ(outbuf[4],'x');
+ YAZ_CHECK_EQ(outbuf[5],10000+5);
+ YAZ_CHECK_EQ(sz,11-5);
+
+ yaz_nfa_destroy(n);
+}
+
+
int main(int argc, char **argv)
{
YAZ_CHECK_INIT(argc, argv);
nmem_init ();
- construction_test();
+ construction_test();
+ converter_test();
nmem_exit ();
YAZ_CHECK_TERM;
}