From aa60dbcd982a4dbd52870120ef34b91a74d63274 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Fri, 27 Apr 2007 10:09:44 +0000 Subject: [PATCH] Separate tokenizer stuff into parsing and configuration types. --- include/yaz/tokenizer.h | 36 ++++++++----- src/cclqfile.c | 40 +++++++------- src/tokenizer.c | 134 ++++++++++++++++++++++++++++------------------- 3 files changed, 125 insertions(+), 85 deletions(-) diff --git a/include/yaz/tokenizer.h b/include/yaz/tokenizer.h index 02cd195..5d62dc8 100644 --- a/include/yaz/tokenizer.h +++ b/include/yaz/tokenizer.h @@ -24,7 +24,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/* $Id: tokenizer.h,v 1.1 2007-04-26 21:45:16 adam Exp $ */ +/* $Id: tokenizer.h,v 1.2 2007-04-27 10:09:44 adam Exp $ */ /** \file tokenizer.h \brief Header with public definitions about YAZ' tokenizer @@ -36,35 +36,45 @@ YAZ_BEGIN_CDECL -#define YAZ_TOKENIZER_EOF 0 -#define YAZ_TOKENIZER_ERROR (-1) -#define YAZ_TOKENIZER_STRING (-2) -#define YAZ_TOKENIZER_QSTRING (-3) +#define YAZ_TOK_EOF 0 +#define YAZ_TOK_ERROR (-1) +#define YAZ_TOK_STRING (-2) +#define YAZ_TOK_QSTRING (-3) -typedef struct yaz_tokenizer *yaz_tokenizer_t; +typedef struct yaz_tok_cfg *yaz_tok_cfg_t; +typedef struct yaz_tok_parse *yaz_tok_parse_t; + +typedef int (*yaz_tok_get_byte_t)(void **vp); + +YAZ_EXPORT +yaz_tok_cfg_t yaz_tok_cfg_create(void); YAZ_EXPORT -yaz_tokenizer_t yaz_tokenizer_create(void); +void yaz_tok_cfg_destroy(yaz_tok_cfg_t t); YAZ_EXPORT -void yaz_tokenizer_destroy(yaz_tokenizer_t t); +void yaz_tok_cfg_single_tokens(yaz_tok_cfg_t t, const char *simple); YAZ_EXPORT -void yaz_tokenizer_read_buf(yaz_tokenizer_t t, const char *buf); +yaz_tok_parse_t yaz_tok_parse_buf(yaz_tok_cfg_t t, const char *buf); YAZ_EXPORT -int yaz_tokenizer_move(yaz_tokenizer_t t); +yaz_tok_parse_t yaz_tok_parse_create(yaz_tok_cfg_t t, yaz_tok_get_byte_t h, + void *vp); YAZ_EXPORT -const char *yaz_tokenizer_string(yaz_tokenizer_t t); +void yaz_tok_parse_destroy(yaz_tok_parse_t tp); YAZ_EXPORT -void yaz_tokenizer_single_tokens(yaz_tokenizer_t t, const char *simple); +int yaz_tok_move(yaz_tok_parse_t tp); + +YAZ_EXPORT +const char *yaz_tok_parse_string(yaz_tok_parse_t tp); YAZ_END_CDECL #endif -/* CQL_H_INCLUDED */ + /* * Local variables: * c-basic-offset: 4 diff --git a/src/cclqfile.c b/src/cclqfile.c index b3abfd8..5d56447 100644 --- a/src/cclqfile.c +++ b/src/cclqfile.c @@ -48,7 +48,7 @@ /* CCL qualifiers * Europagate, 1995 * - * $Id: cclqfile.c,v 1.9 2007-04-26 21:45:17 adam Exp $ + * $Id: cclqfile.c,v 1.10 2007-04-27 10:09:45 adam Exp $ * * Old Europagate Log: * @@ -78,7 +78,7 @@ int ccl_qual_field2(CCL_bibset bibset, const char *cp, const char *qual_name, const char **addinfo) { - yaz_tokenizer_t yt = yaz_tokenizer_create(); + yaz_tok_cfg_t yt = yaz_tok_cfg_create(); int type_ar[MAX_QUAL]; int value_ar[MAX_QUAL]; @@ -87,34 +87,38 @@ int ccl_qual_field2(CCL_bibset bibset, const char *cp, const char *qual_name, int pair_no = 0; char *type_str = 0; int t; + yaz_tok_parse_t tp; - yaz_tokenizer_single_tokens(yt, ",="); - yaz_tokenizer_read_buf(yt, cp); + yaz_tok_cfg_single_tokens(yt, ",="); + + tp = yaz_tok_parse_buf(yt, cp); + + yaz_tok_cfg_destroy(yt); *addinfo = 0; - t = yaz_tokenizer_move(yt); - while (t == YAZ_TOKENIZER_STRING) + t = yaz_tok_move(tp); + while (t == YAZ_TOK_STRING) { /* we don't know what lead is yet */ - char *lead_str = xstrdup(yaz_tokenizer_string(yt)); + char *lead_str = xstrdup(yaz_tok_parse_string(tp)); const char *value_str = 0; int type = 0, value = 0; /* indicates attribute value UNSET */ - t = yaz_tokenizer_move(yt); + t = yaz_tok_move(tp); if (t == ',') { /* full attribute spec: set, type = value */ /* lead is attribute set */ attsets[pair_no] = lead_str; - t = yaz_tokenizer_move(yt); - if (t != YAZ_TOKENIZER_STRING) + t = yaz_tok_move(tp); + if (t != YAZ_TOK_STRING) { *addinfo = "token expected"; goto out; } xfree(type_str); - type_str = xstrdup(yaz_tokenizer_string(yt)); - if (yaz_tokenizer_move(yt) != '=') + type_str = xstrdup(yaz_tok_parse_string(tp)); + if (yaz_tok_move(tp) != '=') { *addinfo = "= expected"; goto out; @@ -133,20 +137,20 @@ int ccl_qual_field2(CCL_bibset bibset, const char *cp, const char *qual_name, /* lead is first of a list of qualifier aliaeses */ /* qualifier alias: q1 q2 ... */ xfree(lead_str); - yaz_tokenizer_destroy(yt); + yaz_tok_parse_destroy(tp); ccl_qual_add_combi (bibset, qual_name, cp); return 0; } while (1) /* comma separated attribute value list */ { - t = yaz_tokenizer_move(yt); + t = yaz_tok_move(tp); /* must have a value now */ - if (t != YAZ_TOKENIZER_STRING) + if (t != YAZ_TOK_STRING) { *addinfo = "value token expected"; goto out; } - value_str = yaz_tokenizer_string(yt); + value_str = yaz_tok_parse_string(tp); if (sscanf(type_str, "%d", &type) == 1) ; @@ -231,7 +235,7 @@ int ccl_qual_field2(CCL_bibset bibset, const char *cp, const char *qual_name, *addinfo = "too many attribute values"; goto out; } - t = yaz_tokenizer_move(yt); + t = yaz_tok_move(tp); if (t != ',') break; attsets[pair_no] = attsets[pair_no-1]; @@ -241,7 +245,7 @@ int ccl_qual_field2(CCL_bibset bibset, const char *cp, const char *qual_name, xfree(type_str); type_str = 0; - yaz_tokenizer_destroy(yt); + yaz_tok_parse_destroy(tp); if (*addinfo) { diff --git a/src/tokenizer.c b/src/tokenizer.c index 622e6ce..e13465f 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -2,7 +2,7 @@ * Copyright (C) 1995-2007, Index Data ApS * See the file LICENSE for details. * - * $Id: tokenizer.c,v 1.1 2007-04-26 21:45:17 adam Exp $ + * $Id: tokenizer.c,v 1.2 2007-04-27 10:09:45 adam Exp $ */ /** @@ -18,51 +18,55 @@ #include #include -struct yaz_tokenizer { - int (*get_byte_func)(const void **vp); - const void *get_byte_data; - +struct yaz_tok_parse { int unget_byte; + WRBUF wr_string; + int look; + + yaz_tok_cfg_t cfg; + yaz_tok_get_byte_t get_byte_func; + void *get_byte_data; +}; + +struct yaz_tok_cfg { + int ref_count; char *white_space; char *single_tokens; char *quote_tokens_begin; char *quote_tokens_end; - WRBUF wr_string; - int look; }; -void yaz_tokenizer_single_tokens(yaz_tokenizer_t t, const char *simple) +void yaz_tok_cfg_single_tokens(yaz_tok_cfg_t t, const char *simple) { xfree(t->single_tokens); t->single_tokens = xstrdup(simple); } -yaz_tokenizer_t yaz_tokenizer_create(void) +yaz_tok_cfg_t yaz_tok_cfg_create(void) { - yaz_tokenizer_t t = xmalloc(sizeof(*t)); + yaz_tok_cfg_t t = xmalloc(sizeof(*t)); t->white_space = xstrdup(" \t\r\n"); t->single_tokens = xstrdup(""); t->quote_tokens_begin = xstrdup("\""); t->quote_tokens_end = xstrdup("\""); - t->get_byte_func = 0; - t->get_byte_data = 0; - t->wr_string = wrbuf_alloc(); - t->look = YAZ_TOKENIZER_ERROR; - t->unget_byte = 0; + t->ref_count = 1; return t; } -void yaz_tokenizer_destroy(yaz_tokenizer_t t) +void yaz_tok_cfg_destroy(yaz_tok_cfg_t t) { - xfree(t->white_space); - xfree(t->single_tokens); - xfree(t->quote_tokens_begin); - xfree(t->quote_tokens_end); - wrbuf_destroy(t->wr_string); - xfree(t); + t->ref_count--; + if (t->ref_count == 0) + { + xfree(t->white_space); + xfree(t->single_tokens); + xfree(t->quote_tokens_begin); + xfree(t->quote_tokens_end); + xfree(t); + } } -static int read_buf(const void **vp) +static int read_buf(void **vp) { const char *cp = *(const char **) vp; int ch = *cp; @@ -74,77 +78,99 @@ static int read_buf(const void **vp) return ch; } -static int get_byte(yaz_tokenizer_t t) +yaz_tok_parse_t yaz_tok_parse_buf(yaz_tok_cfg_t t, const char *buf) { - int ch = t->unget_byte; - assert(t->get_byte_func); + return yaz_tok_parse_create(t, read_buf, (void *) buf); +} + +static int get_byte(yaz_tok_parse_t tp) +{ + int ch = tp->unget_byte; + assert(tp->get_byte_func); if (ch) - t->unget_byte = 0; + tp->unget_byte = 0; else - ch = t->get_byte_func(&t->get_byte_data); + ch = tp->get_byte_func(&tp->get_byte_data); return ch; } -static void unget_byte(yaz_tokenizer_t t, int ch) +static void unget_byte(yaz_tok_parse_t tp, int ch) +{ + tp->unget_byte = ch; +} + +yaz_tok_parse_t yaz_tok_parse_create(yaz_tok_cfg_t t, + yaz_tok_get_byte_t h, + void *vp) { - t->unget_byte = ch; + yaz_tok_parse_t tp = xmalloc(sizeof(*tp)); + + tp->cfg = t; + tp->cfg->ref_count++; + tp->get_byte_func = h; + tp->get_byte_data = vp; + + tp->look = YAZ_TOK_ERROR; + tp->unget_byte = 0; + + tp->wr_string = wrbuf_alloc(); + return tp; } + -void yaz_tokenizer_read_buf(yaz_tokenizer_t t, const char *buf) +void yaz_tok_parse_destroy(yaz_tok_parse_t tp) { - assert(t); - t->get_byte_func = read_buf; - t->get_byte_data = buf; + yaz_tok_cfg_destroy(tp->cfg); + wrbuf_destroy(tp->wr_string); + xfree(tp); } -int yaz_tokenizer_move(yaz_tokenizer_t t) +int yaz_tok_move(yaz_tok_parse_t tp) { + yaz_tok_cfg_t t = tp->cfg; const char *cp; - int ch = get_byte(t); + int ch = get_byte(tp); /* skip white space */ while (ch && strchr(t->white_space, ch)) - ch = get_byte(t); + ch = get_byte(tp); if (!ch) { - ch = YAZ_TOKENIZER_EOF; + ch = YAZ_TOK_EOF; } else if ((cp = strchr(t->single_tokens, ch))) ch = *cp; /* single token match */ else if ((cp = strchr(t->quote_tokens_begin, ch))) { /* quoted string */ int end_ch = t->quote_tokens_end[cp - t->quote_tokens_begin]; - ch = get_byte(t); - wrbuf_rewind(t->wr_string); + ch = get_byte(tp); + wrbuf_rewind(tp->wr_string); while (ch && ch != end_ch) - wrbuf_putc(t->wr_string, ch); + wrbuf_putc(tp->wr_string, ch); if (!ch) - ch = YAZ_TOKENIZER_ERROR; + ch = YAZ_TOK_ERROR; else - ch = YAZ_TOKENIZER_QSTRING; + ch = YAZ_TOK_QSTRING; } else { /* unquoted string */ - wrbuf_rewind(t->wr_string); + wrbuf_rewind(tp->wr_string); while (ch && !strchr(t->white_space, ch) && !strchr(t->single_tokens, ch)) { - wrbuf_putc(t->wr_string, ch); - ch = get_byte(t); + wrbuf_putc(tp->wr_string, ch); + ch = get_byte(tp); } - unget_byte(t, ch); - ch = YAZ_TOKENIZER_STRING; + unget_byte(tp, ch); + ch = YAZ_TOK_STRING; } - t->look = ch; - yaz_log(YLOG_LOG, "tokenizer returns %d (%s)", ch, - wrbuf_cstr(t->wr_string)); - + tp->look = ch; return ch; } -const char *yaz_tokenizer_string(yaz_tokenizer_t t) +const char *yaz_tok_parse_string(yaz_tok_parse_t tp) { - return wrbuf_cstr(t->wr_string); + return wrbuf_cstr(tp->wr_string); } /* -- 1.7.10.4