From: Adam Dickmeiss Date: Thu, 3 Apr 2008 15:19:55 +0000 (+0200) Subject: Refactor iconv decoders. X-Git-Tag: v3.0.30~57 X-Git-Url: http://sru.miketaylor.org.uk/cgi-bin?a=commitdiff_plain;h=96c6e58f286787106e4a7b3bb3900a36051968d6;p=yaz-moved-to-github.git Refactor iconv decoders. --- diff --git a/src/Makefile.am b/src/Makefile.am index 23a5bfa..dece367 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -96,7 +96,8 @@ libyaz_la_SOURCES=version.c options.c log.c \ mime.c mime.h oid_util.c tokenizer.c \ record_conv.c retrieval.c elementset.c snprintf.c query-charset.c \ copy_types.c match_glob.c poll.c daemon.c \ - iconv_encode_marc8.c iconv_encode_iso_8859_1.c iconv_encode_wchar.c + iconv_encode_marc8.c iconv_encode_iso_8859_1.c iconv_encode_wchar.c \ + iconv_decode_marc8.c libyaz_la_LDFLAGS=-version-info $(YAZ_VERSION_INFO) diff --git a/src/advancegreek.c b/src/advancegreek.c index 7792073..5a01489 100644 --- a/src/advancegreek.c +++ b/src/advancegreek.c @@ -18,8 +18,9 @@ #include "iconv-p.h" -unsigned long yaz_read_advancegreek(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read) +static unsigned long read_advancegreek(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inp, + size_t inbytesleft, size_t *no_read) { unsigned long x = 0; int shift = 0; @@ -384,6 +385,17 @@ yaz_iconv_encoder_t yaz_advancegreek_encoder(const char *name, return 0; } +yaz_iconv_decoder_t yaz_advancegreek_decoder(const char *name, + yaz_iconv_decoder_t d) +{ + if (!yaz_matchstr(name, "advancegreek")) + { + d->read_handle = read_advancegreek; + return d; + } + return 0; +} + /* * Local variables: * c-basic-offset: 4 diff --git a/src/iconv-p.h b/src/iconv-p.h index 1b199fd..f78f2ac 100644 --- a/src/iconv-p.h +++ b/src/iconv-p.h @@ -26,7 +26,7 @@ */ /** * \file - * \brief Internal header for conv + * \brief Internal header for iconv */ #ifndef ICONV_P_H @@ -38,22 +38,6 @@ void yaz_iconv_set_errno(yaz_iconv_t cd, int no); -unsigned long yaz_read_iso5428_1984(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read); - -size_t yaz_init_UTF8(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read); -unsigned long yaz_read_UTF8(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read); - - -unsigned long yaz_read_UCS4(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read); -unsigned long yaz_read_UCS4LE(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read); -unsigned long yaz_read_advancegreek(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read); - typedef struct yaz_iconv_encoder_s *yaz_iconv_encoder_t; struct yaz_iconv_encoder_s { void *data; @@ -90,6 +74,33 @@ int yaz_iso_8859_1_lookup_y(unsigned long v, int yaz_iso_8859_1_lookup_x12(unsigned long x1, unsigned long x2, unsigned long *y); +typedef struct yaz_iconv_decoder_s *yaz_iconv_decoder_t; +struct yaz_iconv_decoder_s { + void *data; + size_t (*init_handle)(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inbuf, + size_t inbytesleft, size_t *no_read); + unsigned long (*read_handle)(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inbuf, + size_t inbytesleft, size_t *no_read); + void (*destroy_handle)(yaz_iconv_decoder_t d); +}; + +yaz_iconv_decoder_t yaz_marc8_decoder(const char *fromcode, + yaz_iconv_decoder_t d); +yaz_iconv_decoder_t yaz_utf8_decoder(const char *fromcode, + yaz_iconv_decoder_t d); +yaz_iconv_decoder_t yaz_ucs4_decoder(const char *tocode, + yaz_iconv_decoder_t d); +yaz_iconv_decoder_t yaz_iso_8859_1_decoder(const char *fromcode, + yaz_iconv_decoder_t d); +yaz_iconv_decoder_t yaz_iso_5428_decoder(const char *name, + yaz_iconv_decoder_t d); +yaz_iconv_decoder_t yaz_advancegreek_decoder(const char *name, + yaz_iconv_decoder_t d); +yaz_iconv_decoder_t yaz_wchar_decoder(const char *fromcode, + yaz_iconv_decoder_t d); + #endif /* * Local variables: diff --git a/src/iconv_decode_marc8.c b/src/iconv_decode_marc8.c new file mode 100644 index 0000000..7b890ad --- /dev/null +++ b/src/iconv_decode_marc8.c @@ -0,0 +1,284 @@ +/* This file is part of the YAZ toolkit. + * Copyright (C) 1995-2008 Index Data + * See the file LICENSE for details. + */ +/** + * \file + * \brief MARC-8 decoding + * + * MARC-8 reference: + * http://www.loc.gov/marc/specifications/speccharmarc8.html + */ + +#if HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include + +#include +#include "iconv-p.h" + +struct decoder_data { + int g0_mode; + int g1_mode; + + int comb_offset; + int comb_size; + unsigned long comb_x[8]; + size_t comb_no_read[8]; +}; + +yaz_conv_func_t yaz_marc8_42_conv; +yaz_conv_func_t yaz_marc8_45_conv; +yaz_conv_func_t yaz_marc8_67_conv; +yaz_conv_func_t yaz_marc8_62_conv; +yaz_conv_func_t yaz_marc8_70_conv; +yaz_conv_func_t yaz_marc8_32_conv; +yaz_conv_func_t yaz_marc8_4E_conv; +yaz_conv_func_t yaz_marc8_51_conv; +yaz_conv_func_t yaz_marc8_33_conv; +yaz_conv_func_t yaz_marc8_34_conv; +yaz_conv_func_t yaz_marc8_53_conv; +yaz_conv_func_t yaz_marc8_31_conv; + + +static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, + struct decoder_data *data, + unsigned char *inp, + size_t inbytesleft, size_t *no_read, + int *comb); + +static unsigned long read_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inp, + size_t inbytesleft, size_t *no_read) +{ + struct decoder_data *data = d->data; + unsigned long x; + if (data->comb_offset < data->comb_size) + { + *no_read = data->comb_no_read[data->comb_offset]; + x = data->comb_x[data->comb_offset]; + + /* special case for double-diacritic combining characters, + INVERTED BREVE and DOUBLE TILDE. + We'll increment the no_read counter by 1, since we want to skip over + the processing of the closing ligature character + */ + /* this code is no longer necessary.. our handlers code in + yaz_marc8_?_conv (generated by charconv.tcl) now returns + 0 and no_read=1 when a sequence does not match the input. + The SECOND HALFs in codetables.xml produces a non-existant + entry in the conversion trie.. Hence when met, the input byte is + skipped as it should (in yaz_iconv) + */ +#if 0 + if (x == 0x0361 || x == 0x0360) + *no_read += 1; +#endif + data->comb_offset++; + return x; + } + + data->comb_offset = 0; + for (data->comb_size = 0; data->comb_size < 8; data->comb_size++) + { + int comb = 0; + + if (inbytesleft == 0 && data->comb_size) + { + yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL); + x = 0; + *no_read = 0; + break; + } + x = yaz_read_marc8_comb(cd, data, inp, inbytesleft, no_read, &comb); + if (!comb || !x) + break; + data->comb_x[data->comb_size] = x; + data->comb_no_read[data->comb_size] = *no_read; + inp += *no_read; + inbytesleft = inbytesleft - *no_read; + } + return x; +} + +static unsigned long read_marc8s(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inp, + size_t inbytesleft, size_t *no_read) +{ + struct decoder_data *data = d->data; + unsigned long x = read_marc8(cd, d, inp, inbytesleft, no_read); + if (x && data->comb_size == 1) + { + if (yaz_iso_8859_1_lookup_x12(x, data->comb_x[0], &x)) + { + *no_read += data->comb_no_read[0]; + data->comb_size = 0; + } + } + return x; +} + +static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, + struct decoder_data *data, + unsigned char *inp, + size_t inbytesleft, size_t *no_read, + int *comb) +{ + *no_read = 0; + while (inbytesleft > 0 && *inp == 27) + { + int *modep = &data->g0_mode; + size_t inbytesleft0 = inbytesleft; + + inbytesleft--; + inp++; + if (inbytesleft == 0) + goto incomplete; + if (*inp == '$') /* set with multiple bytes */ + { + inbytesleft--; + inp++; + } + if (inbytesleft == 0) + goto incomplete; + if (*inp == '(' || *inp == ',') /* G0 */ + { + inbytesleft--; + inp++; + } + else if (*inp == ')' || *inp == '-') /* G1 */ + { + inbytesleft--; + inp++; + modep = &data->g1_mode; + } + if (inbytesleft == 0) + goto incomplete; + if (*inp == '!') /* ANSEL is a special case */ + { + inbytesleft--; + inp++; + } + if (inbytesleft == 0) + goto incomplete; + *modep = *inp++; /* Final character */ + inbytesleft--; + + (*no_read) += inbytesleft0 - inbytesleft; + } + if (inbytesleft == 0) + return 0; + else if (*inp == ' ') + { + *no_read += 1; + return ' '; + } + else + { + unsigned long x; + size_t no_read_sub = 0; + int mode = *inp < 128 ? data->g0_mode : data->g1_mode; + *comb = 0; + + switch(mode) + { + case 'B': /* Basic ASCII */ + case 's': /* ASCII */ + x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case 'E': /* ANSEL */ + x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128); + break; + case 'g': /* Greek */ + x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case 'b': /* Subscripts */ + x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case 'p': /* Superscripts */ + x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case '2': /* Basic Hebrew */ + x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case 'N': /* Basic Cyrillic */ + x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case 'Q': /* Extended Cyrillic */ + x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case '3': /* Basic Arabic */ + x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case '4': /* Extended Arabic */ + x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case 'S': /* Greek */ + x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + case '1': /* Chinese, Japanese, Korean (EACC) */ + x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); + break; + default: + *no_read = 0; + yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ); + return 0; + } + *no_read += no_read_sub; + return x; + } +incomplete: + *no_read = 0; + yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL); + return 0; +} + + +static size_t init_marc8(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inp, + size_t inbytesleft, size_t *no_read) +{ + struct decoder_data *data = d->data; + data->g0_mode = 'B'; + data->g1_mode = 'E'; + data->comb_offset = data->comb_size = 0; + return 0; +} + +void destroy_marc8(yaz_iconv_decoder_t d) +{ + struct decoder_data *data = d->data; + xfree(data); +} + +yaz_iconv_decoder_t yaz_marc8_decoder(const char *fromcode, + yaz_iconv_decoder_t d) +{ + if (!yaz_matchstr(fromcode, "MARC8")) + d->read_handle = read_marc8; + else if (!yaz_matchstr(fromcode, "MARC8s")) + d->read_handle = read_marc8s; + else + return 0; + { + struct decoder_data *data = xmalloc(sizeof(*data)); + d->data = data; + d->init_handle = init_marc8; + d->destroy_handle = destroy_marc8; + } + return d; +} + + +/* + * Local variables: + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + * vim: shiftwidth=4 tabstop=8 expandtab + */ diff --git a/src/iconv_encode_iso_8859_1.c b/src/iconv_encode_iso_8859_1.c index b2d11ee..f0a6a5e 100644 --- a/src/iconv_encode_iso_8859_1.c +++ b/src/iconv_encode_iso_8859_1.c @@ -17,12 +17,7 @@ #include #include -#if HAVE_ICONV_H -#include -#endif - #include -#include #include "iconv-p.h" struct encoder_data @@ -244,6 +239,28 @@ yaz_iconv_encoder_t yaz_iso_8859_1_encoder(const char *tocode, return 0; } +static unsigned long read_ISO8859_1(yaz_iconv_t cd, + yaz_iconv_decoder_t d, + unsigned char *inp, + size_t inbytesleft, size_t *no_read) +{ + unsigned long x = inp[0]; + *no_read = 1; + return x; +} + +yaz_iconv_decoder_t yaz_iso_8859_1_decoder(const char *fromcode, + yaz_iconv_decoder_t d) + +{ + if (!yaz_matchstr(fromcode, "iso88591")) + { + d->read_handle = read_ISO8859_1; + return d; + } + return 0; +} + /* * Local variables: diff --git a/src/iconv_encode_marc8.c b/src/iconv_encode_marc8.c index a07bd2a..055aa0e 100644 --- a/src/iconv_encode_marc8.c +++ b/src/iconv_encode_marc8.c @@ -20,7 +20,6 @@ #include #include -#include #include #include "iconv-p.h" diff --git a/src/iconv_encode_wchar.c b/src/iconv_encode_wchar.c index cf0a5f6..a09b530 100644 --- a/src/iconv_encode_wchar.c +++ b/src/iconv_encode_wchar.c @@ -20,8 +20,6 @@ #endif #include -#include -#include #include "iconv-p.h" struct encoder_data @@ -67,6 +65,43 @@ yaz_iconv_encoder_t yaz_wchar_encoder(const char *tocode, return 0; } +#if HAVE_WCHAR_H +static unsigned long read_wchar_t(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inp, + size_t inbytesleft, size_t *no_read) +{ + unsigned long x = 0; + + if (inbytesleft < sizeof(wchar_t)) + { + yaz_iconv_set_errno(cd, YAZ_ICONV_EINVAL); /* incomplete input */ + *no_read = 0; + } + else + { + wchar_t wch; + memcpy(&wch, inp, sizeof(wch)); + x = wch; + *no_read = sizeof(wch); + } + return x; +} +#endif + +yaz_iconv_decoder_t yaz_wchar_decoder(const char *fromcode, + yaz_iconv_decoder_t d) + +{ +#if HAVE_WCHAR_H + if (!yaz_matchstr(fromcode, "wchar_t")) + { + d->read_handle = read_wchar_t; + return d; + } +#endif + return 0; +} + /* * Local variables: diff --git a/src/iso5428.c b/src/iso5428.c index 752f9c8..90cc782 100644 --- a/src/iso5428.c +++ b/src/iso5428.c @@ -18,8 +18,9 @@ #include "iconv-p.h" -unsigned long yaz_read_iso5428_1984(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read) +static unsigned long read_iso_5428_1984(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inp, + size_t inbytesleft, size_t *no_read) { unsigned long x = 0; int tonos = 0; @@ -376,6 +377,19 @@ yaz_iconv_encoder_t yaz_iso_5428_encoder(const char *name, return 0; } +yaz_iconv_decoder_t yaz_iso_5428_decoder(const char *name, + yaz_iconv_decoder_t d) +{ + if (!yaz_matchstr(name, "iso54281984") + || !yaz_matchstr(name, "iso5428:1984")) + { + d->read_handle = read_iso_5428_1984; + return d; + } + return 0; +} + + /* * Local variables: diff --git a/src/siconv.c b/src/siconv.c index b6bcf0e..36dc4b2 100644 --- a/src/siconv.c +++ b/src/siconv.c @@ -21,9 +21,6 @@ #include #include #include -#if HAVE_WCHAR_H -#include -#endif #if HAVE_ICONV_H #include @@ -33,262 +30,28 @@ #include #include "iconv-p.h" -yaz_conv_func_t yaz_marc8_42_conv; -yaz_conv_func_t yaz_marc8_45_conv; -yaz_conv_func_t yaz_marc8_67_conv; -yaz_conv_func_t yaz_marc8_62_conv; -yaz_conv_func_t yaz_marc8_70_conv; -yaz_conv_func_t yaz_marc8_32_conv; -yaz_conv_func_t yaz_marc8_4E_conv; -yaz_conv_func_t yaz_marc8_51_conv; -yaz_conv_func_t yaz_marc8_33_conv; -yaz_conv_func_t yaz_marc8_34_conv; -yaz_conv_func_t yaz_marc8_53_conv; -yaz_conv_func_t yaz_marc8_31_conv; - struct yaz_iconv_struct { int my_errno; int init_flag; +#if 0 size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf, size_t inbytesleft, size_t *no_read); unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf, size_t inbytesleft, size_t *no_read); - int g0_mode; - int g1_mode; - - int comb_offset; - int comb_size; - unsigned long comb_x[8]; - size_t comb_no_read[8]; +#endif size_t no_read_x; unsigned long unget_x; #if HAVE_ICONV_H iconv_t iconv_cd; #endif struct yaz_iconv_encoder_s encoder; + struct yaz_iconv_decoder_s decoder; }; -static unsigned long yaz_read_ISO8859_1(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read) -{ - unsigned long x = inp[0]; - *no_read = 1; - return x; -} - -#if HAVE_WCHAR_H -static unsigned long yaz_read_wchar_t(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read) -{ - unsigned long x = 0; - - if (inbytesleft < sizeof(wchar_t)) - { - cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */ - *no_read = 0; - } - else - { - wchar_t wch; - memcpy(&wch, inp, sizeof(wch)); - x = wch; - *no_read = sizeof(wch); - } - return x; -} -#endif - - -static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read, - int *comb); - -static unsigned long yaz_read_marc8(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read) -{ - unsigned long x; - if (cd->comb_offset < cd->comb_size) - { - *no_read = cd->comb_no_read[cd->comb_offset]; - x = cd->comb_x[cd->comb_offset]; - - /* special case for double-diacritic combining characters, - INVERTED BREVE and DOUBLE TILDE. - We'll increment the no_read counter by 1, since we want to skip over - the processing of the closing ligature character - */ - /* this code is no longer necessary.. our handlers code in - yaz_marc8_?_conv (generated by charconv.tcl) now returns - 0 and no_read=1 when a sequence does not match the input. - The SECOND HALFs in codetables.xml produces a non-existant - entry in the conversion trie.. Hence when met, the input byte is - skipped as it should (in yaz_iconv) - */ -#if 0 - if (x == 0x0361 || x == 0x0360) - *no_read += 1; -#endif - cd->comb_offset++; - return x; - } - - cd->comb_offset = 0; - for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++) - { - int comb = 0; - - if (inbytesleft == 0 && cd->comb_size) - { - cd->my_errno = YAZ_ICONV_EINVAL; - x = 0; - *no_read = 0; - break; - } - x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb); - if (!comb || !x) - break; - cd->comb_x[cd->comb_size] = x; - cd->comb_no_read[cd->comb_size] = *no_read; - inp += *no_read; - inbytesleft = inbytesleft - *no_read; - } - return x; -} - -static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read) -{ - unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read); - if (x && cd->comb_size == 1) - { - if (yaz_iso_8859_1_lookup_x12(x, cd->comb_x[0], &x)) - { - *no_read += cd->comb_no_read[0]; - cd->comb_size = 0; - } - } - return x; -} - -static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read, - int *comb) -{ - *no_read = 0; - while (inbytesleft > 0 && *inp == 27) - { - int *modep = &cd->g0_mode; - size_t inbytesleft0 = inbytesleft; - - inbytesleft--; - inp++; - if (inbytesleft == 0) - goto incomplete; - if (*inp == '$') /* set with multiple bytes */ - { - inbytesleft--; - inp++; - } - if (inbytesleft == 0) - goto incomplete; - if (*inp == '(' || *inp == ',') /* G0 */ - { - inbytesleft--; - inp++; - } - else if (*inp == ')' || *inp == '-') /* G1 */ - { - inbytesleft--; - inp++; - modep = &cd->g1_mode; - } - if (inbytesleft == 0) - goto incomplete; - if (*inp == '!') /* ANSEL is a special case */ - { - inbytesleft--; - inp++; - } - if (inbytesleft == 0) - goto incomplete; - *modep = *inp++; /* Final character */ - inbytesleft--; - - (*no_read) += inbytesleft0 - inbytesleft; - } - if (inbytesleft == 0) - return 0; - else if (*inp == ' ') - { - *no_read += 1; - return ' '; - } - else - { - unsigned long x; - size_t no_read_sub = 0; - int mode = *inp < 128 ? cd->g0_mode : cd->g1_mode; - *comb = 0; - - switch(mode) - { - case 'B': /* Basic ASCII */ - case 's': /* ASCII */ - x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); - break; - case 'E': /* ANSEL */ - x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128); - break; - case 'g': /* Greek */ - x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); - break; - case 'b': /* Subscripts */ - x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); - break; - case 'p': /* Superscripts */ - x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); - break; - case '2': /* Basic Hebrew */ - x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); - break; - case 'N': /* Basic Cyrillic */ - x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); - break; - case 'Q': /* Extended Cyrillic */ - x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); - break; - case '3': /* Basic Arabic */ - x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); - break; - case '4': /* Extended Arabic */ - x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); - break; - case 'S': /* Greek */ - x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); - break; - case '1': /* Chinese, Japanese, Korean (EACC) */ - x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0); - break; - default: - *no_read = 0; - cd->my_errno = YAZ_ICONV_EILSEQ; - return 0; - } - *no_read += no_read_sub; - return x; - } -incomplete: - *no_read = 0; - cd->my_errno = YAZ_ICONV_EINVAL; - return 0; -} - - - int yaz_iconv_isbuiltin(yaz_iconv_t cd) { - return cd->read_handle && cd->encoder.write_handle; + return cd->decoder.read_handle && cd->encoder.write_handle; } @@ -311,6 +74,25 @@ static int prepare_encoders(yaz_iconv_t cd, const char *tocode) return 0; } +static int prepare_decoders(yaz_iconv_t cd, const char *tocode) +{ + if (yaz_marc8_decoder(tocode, &cd->decoder)) + return 1; + if (yaz_utf8_decoder(tocode, &cd->decoder)) + return 1; + if (yaz_ucs4_decoder(tocode, &cd->decoder)) + return 1; + if (yaz_iso_8859_1_decoder(tocode, &cd->decoder)) + return 1; + if (yaz_iso_5428_decoder(tocode, &cd->decoder)) + return 1; + if (yaz_advancegreek_decoder(tocode, &cd->decoder)) + return 1; + if (yaz_wchar_decoder(tocode, &cd->decoder)) + return 1; + return 0; +} + yaz_iconv_t yaz_iconv_open(const char *tocode, const char *fromcode) { yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd)); @@ -321,8 +103,11 @@ yaz_iconv_t yaz_iconv_open(const char *tocode, const char *fromcode) cd->encoder.init_handle = 0; cd->encoder.destroy_handle = 0; - cd->read_handle = 0; - cd->init_handle = 0; + cd->decoder.data = 0; + cd->decoder.read_handle = 0; + cd->decoder.init_handle = 0; + cd->decoder.destroy_handle = 0; + cd->my_errno = YAZ_ICONV_UNKNOWN; /* a useful hack: if fromcode has leading @, @@ -331,37 +116,13 @@ yaz_iconv_t yaz_iconv_open(const char *tocode, const char *fromcode) fromcode++; else { - if (!yaz_matchstr(fromcode, "UTF8")) - { - cd->read_handle = yaz_read_UTF8; - cd->init_handle = yaz_init_UTF8; - } - else if (!yaz_matchstr(fromcode, "ISO88591")) - cd->read_handle = yaz_read_ISO8859_1; - else if (!yaz_matchstr(fromcode, "UCS4")) - cd->read_handle = yaz_read_UCS4; - else if (!yaz_matchstr(fromcode, "UCS4LE")) - cd->read_handle = yaz_read_UCS4LE; - else if (!yaz_matchstr(fromcode, "MARC8")) - cd->read_handle = yaz_read_marc8; - else if (!yaz_matchstr(fromcode, "MARC8s")) - cd->read_handle = yaz_read_marc8s; - else if (!yaz_matchstr(fromcode, "advancegreek")) - cd->read_handle = yaz_read_advancegreek; - else if (!yaz_matchstr(fromcode, "iso54281984")) - cd->read_handle = yaz_read_iso5428_1984; - else if (!yaz_matchstr(fromcode, "iso5428:1984")) - cd->read_handle = yaz_read_iso5428_1984; -#if HAVE_WCHAR_H - else if (!yaz_matchstr(fromcode, "WCHAR_T")) - cd->read_handle = yaz_read_wchar_t; -#endif prepare_encoders(cd, tocode); + prepare_decoders(cd, fromcode); } - if (cd->read_handle && cd->encoder.write_handle) + if (cd->decoder.read_handle && cd->encoder.write_handle) { #if HAVE_ICONV_H - cd->iconv_cd = 0; + cd->iconv_cd = (iconv_t) (-1); #endif ; } @@ -390,7 +151,7 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, size_t r = 0; #if HAVE_ICONV_H - if (cd->iconv_cd) + if (cd->iconv_cd != (iconv_t) (-1)) { size_t r = iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft); @@ -421,22 +182,21 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, if (cd->init_flag) { cd->my_errno = YAZ_ICONV_UNKNOWN; - cd->g0_mode = 'B'; - cd->g1_mode = 'E'; - cd->comb_offset = cd->comb_size = 0; - if (cd->encoder.init_handle) (*cd->encoder.init_handle)(&cd->encoder); cd->unget_x = 0; cd->no_read_x = 0; - if (cd->init_handle && inbuf && *inbuf) + if (cd->decoder.init_handle) { size_t no_read = 0; - size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf, - *inbytesleft, &no_read); + size_t r = (cd->decoder.init_handle)( + cd, &cd->decoder, + inbuf ? (unsigned char *) *inbuf : 0, + inbytesleft ? *inbytesleft : 0, + &no_read); if (r) { if (cd->my_errno == YAZ_ICONV_EINVAL) @@ -444,8 +204,10 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, cd->init_flag = 0; return r; } - *inbytesleft -= no_read; - *inbuf += no_read; + if (inbytesleft) + *inbytesleft -= no_read; + if (inbuf) + *inbuf += no_read; } } cd->init_flag = 0; @@ -483,8 +245,9 @@ size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, r = *inbuf - inbuf0; break; } - x = (*cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft, - &no_read); + x = (*cd->decoder.read_handle)( + cd, &cd->decoder, + (unsigned char *) *inbuf, *inbytesleft, &no_read); if (no_read == 0) { r = (size_t)(-1); @@ -522,11 +285,13 @@ int yaz_iconv_error(yaz_iconv_t cd) int yaz_iconv_close(yaz_iconv_t cd) { #if HAVE_ICONV_H - if (cd->iconv_cd) + if (cd->iconv_cd != (iconv_t) (-1)) iconv_close(cd->iconv_cd); #endif if (cd->encoder.destroy_handle) (*cd->encoder.destroy_handle)(&cd->encoder); + if (cd->decoder.destroy_handle) + (*cd->decoder.destroy_handle)(&cd->decoder); xfree(cd); return 0; } diff --git a/src/ucs4.c b/src/ucs4.c index d298224..3c28b98 100644 --- a/src/ucs4.c +++ b/src/ucs4.c @@ -18,8 +18,9 @@ #include "iconv-p.h" -unsigned long yaz_read_UCS4(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read) +static unsigned long read_UCS4(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inp, + size_t inbytesleft, size_t *no_read) { unsigned long x = 0; @@ -36,8 +37,9 @@ unsigned long yaz_read_UCS4(yaz_iconv_t cd, unsigned char *inp, return x; } -unsigned long yaz_read_UCS4LE(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read) +static unsigned long read_UCS4LE(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inp, + size_t inbytesleft, size_t *no_read) { unsigned long x = 0; @@ -112,6 +114,19 @@ yaz_iconv_encoder_t yaz_ucs4_encoder(const char *tocode, return e; } +yaz_iconv_decoder_t yaz_ucs4_decoder(const char *tocode, + yaz_iconv_decoder_t d) + +{ + if (!yaz_matchstr(tocode, "UCS4")) + d->read_handle = read_UCS4; + else if (!yaz_matchstr(tocode, "UCS4LE")) + d->read_handle = read_UCS4LE; + else + return 0; + return d; +} + /* diff --git a/src/utf8.c b/src/utf8.c index 42515ca..bf92cd3 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -18,8 +18,9 @@ #include "iconv-p.h" -size_t yaz_init_UTF8(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read) +static size_t init_utf8(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inp, + size_t inbytesleft, size_t *no_read) { if (inp[0] != 0xef) { @@ -135,8 +136,9 @@ unsigned long yaz_read_UTF8_char(unsigned char *inp, return x; } -unsigned long yaz_read_UTF8(yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read) +static unsigned long read_utf8(yaz_iconv_t cd, yaz_iconv_decoder_t d, + unsigned char *inp, + size_t inbytesleft, size_t *no_read) { int err = 0; int r = yaz_read_UTF8_char(inp, inbytesleft, no_read, &err); @@ -227,6 +229,18 @@ yaz_iconv_encoder_t yaz_utf8_encoder(const char *tocode, return 0; } +yaz_iconv_decoder_t yaz_utf8_decoder(const char *fromcode, + yaz_iconv_decoder_t d) +{ + if (!yaz_matchstr(fromcode, "UTF8")) + { + d->init_handle = init_utf8; + d->read_handle = read_utf8; + return d; + } + return 0; +} + /* * Local variables: diff --git a/win/makefile b/win/makefile index b8f9d8a..f9f40a4 100644 --- a/win/makefile +++ b/win/makefile @@ -487,6 +487,7 @@ MISC_OBJS= \ $(OBJDIR)\daemon.obj \ $(OBJDIR)\iconv_encode_iso_8859_1.obj \ $(OBJDIR)\iconv_encode_marc8.obj \ + $(OBJDIR)\iconv_decode_marc8.obj \ $(OBJDIR)\iconv_encode_wchar.obj Z3950_OBJS= \