From 711f37334de1dde9fee4dfdee2e9263f42373494 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Mon, 24 Apr 2006 23:21:25 +0000 Subject: [PATCH] Added support for a new character set MARC8s identical to MARC8 except that it converts combined characters to single Unicode characters in the Latin-1 range (when possible). --- NEWS | 4 + src/siconv.c | 173 ++++++++++++--------- test/tsticonv.c | 449 ++++++++++++++++++++++++++++--------------------------- 3 files changed, 335 insertions(+), 291 deletions(-) diff --git a/NEWS b/NEWS index 1d43ba3..a7328c0 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +Added support for a new character set MARC8s identical to MARC8 except +that it converts combined characters to single Unicode characters in the +Latin-1 range (when possible). + --- 2.1.18 2006/04/24 ASN.1 compiler 0.4: generates brief Doxygen file header for diff --git a/src/siconv.c b/src/siconv.c index d3e078f..bee184b 100644 --- a/src/siconv.c +++ b/src/siconv.c @@ -2,7 +2,7 @@ * Copyright (C) 1995-2006, Index Data ApS * See the file LICENSE for details. * - * $Id: siconv.c,v 1.21 2006-04-19 23:48:06 adam Exp $ + * $Id: siconv.c,v 1.22 2006-04-24 23:21:26 adam Exp $ */ /** * \file siconv.c @@ -99,6 +99,77 @@ struct yaz_iconv_struct { const char *write_marc8_page_chr; }; +static struct { + unsigned long x1, x2; + unsigned y; +} latin1_comb[] = { + { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */ + { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */ + { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */ + { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */ + { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */ + { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */ + /* no need for 0xc6 LATIN CAPITAL LETTER AE */ + { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */ + { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */ + { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */ + { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */ + { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */ + { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */ + { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */ + { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */ + { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */ + { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */ + { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */ + { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */ + { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */ + { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */ + { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */ + /* omitted: 0xd7 MULTIPLICATION SIGN */ + /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */ + { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */ + { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */ + { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */ + { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */ + { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */ + /* omitted: 0xde LATIN CAPITAL LETTER THORN */ + /* omitted: 0xdf LATIN SMALL LETTER SHARP S */ + { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */ + { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */ + { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */ + { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */ + { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */ + { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */ + /* omitted: 0xe6 LATIN SMALL LETTER AE */ + { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */ + { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */ + { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */ + { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */ + { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */ + { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */ + { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */ + { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */ + { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */ + /* omitted: 0xf0 LATIN SMALL LETTER ETH */ + { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */ + { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */ + { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */ + { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */ + { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */ + { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */ + /* omitted: 0xf7 DIVISION SIGN */ + /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */ + { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */ + { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */ + { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */ + { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */ + { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */ + /* omitted: 0xfe LATIN SMALL LETTER THORN */ + { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */ + + { 0, 0, 0} +}; + static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp, size_t inbytesleft, size_t *no_read) { @@ -318,9 +389,29 @@ static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp, return x; } -static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp, - size_t inbytesleft, size_t *no_read, - int *comb) +static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp, + size_t inbytesleft, size_t *no_read) +{ + unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read); + if (x && cd->comb_size == 1) + { + /* For MARC8s we try to get a Latin-1 page code out of it */ + int i; + for (i = 0; latin1_comb[i].x1; i++) + if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1) + { + *no_read += cd->comb_no_read[0]; + cd->comb_size = 0; + x = latin1_comb[i].y; + break; + } + } + return x; +} + +static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp, + size_t inbytesleft, size_t *no_read, + int *comb) { *no_read = 0; while(inbytesleft >= 1 && inp[0] == 27) @@ -466,76 +557,6 @@ static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x, but since MARC-8 to UTF-8 generates these composed sequence we get a better chance of a successful MARC-8 -> ISO-8859-1 conversion */ - static struct { - unsigned long x1, x2; - unsigned y; - } latin1_comb[] = { - { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */ - { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */ - { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */ - { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */ - { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */ - { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */ - /* no need for 0xc6 LATIN CAPITAL LETTER AE */ - { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */ - { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */ - { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */ - { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */ - { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */ - { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */ - { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */ - { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */ - { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */ - { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */ - { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */ - { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */ - { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */ - { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */ - { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */ - /* omitted: 0xd7 MULTIPLICATION SIGN */ - /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */ - { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */ - { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */ - { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */ - { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */ - { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */ - /* omitted: 0xde LATIN CAPITAL LETTER THORN */ - /* omitted: 0xdf LATIN SMALL LETTER SHARP S */ - { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */ - { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */ - { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */ - { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */ - { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */ - { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */ - /* omitted: 0xe6 LATIN SMALL LETTER AE */ - { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */ - { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */ - { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */ - { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */ - { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */ - { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */ - { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */ - { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */ - { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */ - /* omitted: 0xf0 LATIN SMALL LETTER ETH */ - { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */ - { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */ - { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */ - { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */ - { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */ - { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */ - /* omitted: 0xf7 DIVISION SIGN */ - /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */ - { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */ - { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */ - { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */ - { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */ - { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */ - /* omitted: 0xfe LATIN SMALL LETTER THORN */ - { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */ - - { 0, 0, 0} - }; unsigned char *outp = (unsigned char *) *outbuf; if (cd->compose_char) @@ -880,6 +901,8 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) cd->read_handle = yaz_read_UCS4LE; else if (!yaz_matchstr(fromcode, "MARC8")) cd->read_handle = yaz_read_marc8; + else if (!yaz_matchstr(fromcode, "MARC8s")) + cd->read_handle = yaz_read_marc8s; #if HAVE_WCHAR_H else if (!yaz_matchstr(fromcode, "WCHAR_T")) cd->read_handle = yaz_read_wchar_t; @@ -895,6 +918,8 @@ yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) cd->write_handle = yaz_write_UCS4LE; else if (!yaz_matchstr(tocode, "MARC8")) cd->write_handle = yaz_write_marc8; + else if (!yaz_matchstr(tocode, "MARC8s")) + cd->write_handle = yaz_write_marc8; #if HAVE_WCHAR_H else if (!yaz_matchstr(tocode, "WCHAR_T")) cd->write_handle = yaz_write_wchar_t; diff --git a/test/tsticonv.c b/test/tsticonv.c index fd8f2c8..d95d798 100644 --- a/test/tsticonv.c +++ b/test/tsticonv.c @@ -2,7 +2,7 @@ * Copyright (C) 1995-2005, Index Data ApS * See the file LICENSE for details. * - * $Id: tsticonv.c,v 1.17 2006-04-19 23:15:40 adam Exp $ + * $Id: tsticonv.c,v 1.18 2006-04-24 23:21:26 adam Exp $ */ #if HAVE_CONFIG_H @@ -54,205 +54,193 @@ static int compare_buffers(char *msg, int no, return 0; } -/* some test strings in ISO-8859-1 format */ -static const char *iso_8859_1_a[] = { - "ax" , - "\xd8", - "eneb\346r", - "\xe5" "\xd8", - "\xe5" "\xd8" "b", - "\xe5" "\xe5", - 0 }; - -/* same test strings in MARC-8 format */ -static const char *marc8_a[] = { - "ax", - "\xa2", /* latin capital letter o with stroke */ - "eneb\xb5r", /* latin small letter ae */ - "\xea" "a\xa2", - "\xea" "a\xa2" "b", - "\xea" "a" "\xea" "a", - 0 -}; - -static void tst_marc8_to_iso_8859_1() +static int tst_convert_l(yaz_iconv_t cd, size_t in_len, const char *in_buf, + size_t expect_len, const char *expect_buf) { - int i; - yaz_iconv_t cd; - int ret; + size_t r; + char *inbuf= (char*) in_buf; + size_t inbytesleft = in_len > 0 ? in_len : strlen(in_buf); + char outbuf0[64]; + char *outbuf = outbuf0; - cd = yaz_iconv_open("ISO-8859-1", "MARC8"); - YAZ_CHECK(cd); - if (!cd) - return; - for (i = 0; iso_8859_1_a[i]; i++) + while (inbytesleft) { - size_t r; - char *inbuf= (char*) marc8_a[i]; - size_t inbytesleft = strlen(inbuf); - char outbuf0[32]; - char *outbuf = outbuf0; - size_t outbytesleft = sizeof(outbuf0); - + size_t outbytesleft = outbuf0 + sizeof(outbuf0) - outbuf; + if (outbytesleft > 12) + outbytesleft = 12; r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - YAZ_CHECK(r != (size_t)(-1)); if (r == (size_t) (-1)) + { + int e = yaz_iconv_error(cd); + if (e != YAZ_ICONV_E2BIG) + return 0; + } + else break; - - ret = compare_buffers("tsticonv 11", i, - strlen(iso_8859_1_a[i]), iso_8859_1_a[i], - outbuf - outbuf0, outbuf0); - YAZ_CHECK(ret); } - yaz_iconv_close(cd); + return compare_buffers("tsticonv 22", 0, + expect_len, expect_buf, + outbuf - outbuf0, outbuf0); } -static void tst_marc8_to_ucs4b() +static int tst_convert(yaz_iconv_t cd, const char *buf, const char *cmpbuf) { - static struct { - const char *marc8_b; - int len; - const char *ucs4_b; - } ar[] = { - { - "\033$1" "\x21\x2B\x3B" /* FF1F */ "\033(B" "o", - 8, "\x00\x00\xFF\x1F" "\x00\x00\x00o" - }, { - "\033$1" "\x6F\x77\x29" /* AE0E */ "\x6F\x52\x7C" /* c0F4 */ "\033(B", - 8, "\x00\x00\xAE\x0E" "\x00\x00\xC0\xF4", - }, { - "\033$1" - "\x21\x50\x6E" /* UCS 7CFB */ - "\x21\x51\x31" /* UCS 7D71 */ - "\x21\x3A\x67" /* UCS 5B89 */ - "\x21\x33\x22" /* UCS 5168 */ - "\x21\x33\x53" /* UCS 5206 */ - "\x21\x44\x2B" /* UCS 6790 */ - "\033(B", - 24, "\x00\x00\x7C\xFB" - "\x00\x00\x7D\x71" - "\x00\x00\x5B\x89" - "\x00\x00\x51\x68" - "\x00\x00\x52\x06" - "\x00\x00\x67\x90" - }, { - "\xB0\xB2", /* AYN and oSLASH */ - 8, "\x00\x00\x02\xBB" "\x00\x00\x00\xF8" - }, { - "\xF6\x61", /* a underscore */ - 8, "\x00\x00\x00\x61" "\x00\x00\x03\x32" - }, { - "\x61\xC2", /* a, phonorecord mark */ - 8, "\x00\x00\x00\x61" "\x00\x00\x21\x17" - }, - { /* bug #258 */ - "el" "\xe8" "am\xe8" "an", /* elaman where a is a" */ - 32, - "\x00\x00\x00" "e" - "\x00\x00\x00" "l" - "\x00\x00\x00" "a" - "\x00\x00\x03\x08" - "\x00\x00\x00" "m" - "\x00\x00\x00" "a" - "\x00\x00\x03\x08" - "\x00\x00\x00" "n" - }, - { /* bug #260 */ - "\xe5\xe8\x41", - 12, "\x00\x00\x00\x41" "\x00\x00\x03\x04" "\x00\x00\x03\x08" - }, - { /* bug #416 */ - "\xEB\x74\xEC\x73", - 12, "\x00\x00\x00\x74" "\x00\x00\x03\x61" "\x00\x00\x00\x73" - }, - { /* bug #416 */ - "\xFA\x74\xFB\x73", - 12, "\x00\x00\x00\x74" "\x00\x00\x03\x60" "\x00\x00\x00\x73" - }, + int ret = 0; + WRBUF b = wrbuf_alloc(); + char outbuf[12]; + size_t inbytesleft = strlen(buf); + const char *inp = buf; + while (inbytesleft) { - 0, 0, 0 + size_t outbytesleft = sizeof(outbuf); + char *outp = outbuf; + size_t r = yaz_iconv(cd, (char**) &inp, &inbytesleft, + &outp, &outbytesleft); + if (r == (size_t) (-1)) + { + int e = yaz_iconv_error(cd); + if (e != YAZ_ICONV_E2BIG) + break; + } + wrbuf_write(b, outbuf, outp - outbuf); } - }; - int i; - int ret; - yaz_iconv_t cd; + if (wrbuf_len(b) == strlen(cmpbuf) + && !memcmp(cmpbuf, wrbuf_buf(b), wrbuf_len(b))) + ret = 1; + else + yaz_log(YLOG_LOG, "GOT (%.*s)", wrbuf_len(b), wrbuf_buf(b)); + wrbuf_free(b, 1); + return ret; +} + + +/* some test strings in ISO-8859-1 format */ +static const char *iso_8859_1_a[] = { + "ax" , + "\xd8", + "eneb\346r", + "\xe5" "\xd8", + "\xe5" "\xd8" "b", + "\xe5" "\xe5", + 0 }; - cd = yaz_iconv_open("UCS4", "MARC8"); +static void tst_marc8_to_ucs4b() +{ + yaz_iconv_t cd = yaz_iconv_open("UCS4", "MARC8"); YAZ_CHECK(cd); if (!cd) return; - for (i = 0; ar[i].len; i++) - { - size_t r; - size_t expect_len = ar[i].len; - char *inbuf= (char*) ar[i].marc8_b; - size_t inbytesleft = strlen(inbuf); - char outbuf0[64]; - char *outbuf = outbuf0; + + YAZ_CHECK(tst_convert_l( + cd, + 0, + "\033$1" "\x21\x2B\x3B" /* FF1F */ "\033(B" "o", + 8, + "\x00\x00\xFF\x1F" "\x00\x00\x00o")); + YAZ_CHECK(tst_convert_l( + cd, + 0, + "\033$1" "\x6F\x77\x29" /* AE0E */ + "\x6F\x52\x7C" /* c0F4 */ "\033(B", + 8, + "\x00\x00\xAE\x0E" "\x00\x00\xC0\xF4")); + YAZ_CHECK(tst_convert_l( + cd, + 0, + "\033$1" + "\x21\x50\x6E" /* UCS 7CFB */ + "\x21\x51\x31" /* UCS 7D71 */ + "\x21\x3A\x67" /* UCS 5B89 */ + "\x21\x33\x22" /* UCS 5168 */ + "\x21\x33\x53" /* UCS 5206 */ + "\x21\x44\x2B" /* UCS 6790 */ + "\033(B", + 24, + "\x00\x00\x7C\xFB" + "\x00\x00\x7D\x71" + "\x00\x00\x5B\x89" + "\x00\x00\x51\x68" + "\x00\x00\x52\x06" + "\x00\x00\x67\x90")); + + YAZ_CHECK(tst_convert_l( + cd, + 0, + "\xB0\xB2", /* AYN and oSLASH */ + 8, + "\x00\x00\x02\xBB" "\x00\x00\x00\xF8")); + YAZ_CHECK(tst_convert_l( + cd, + 0, + "\xF6\x61", /* a underscore */ + 8, + "\x00\x00\x00\x61" "\x00\x00\x03\x32")); + + YAZ_CHECK(tst_convert_l( + cd, + 0, + "\x61\xC2", /* a, phonorecord mark */ + 8, + "\x00\x00\x00\x61" "\x00\x00\x21\x17")); + + /* bug #258 */ + YAZ_CHECK(tst_convert_l( + cd, + 0, + "el" "\xe8" "am\xe8" "an", /* elaman where a is a" */ + 32, + "\x00\x00\x00" "e" + "\x00\x00\x00" "l" + "\x00\x00\x00" "a" + "\x00\x00\x03\x08" + "\x00\x00\x00" "m" + "\x00\x00\x00" "a" + "\x00\x00\x03\x08" + "\x00\x00\x00" "n")); + /* bug #260 */ + YAZ_CHECK(tst_convert_l( + cd, + 0, + "\xe5\xe8\x41", + 12, + "\x00\x00\x00\x41" "\x00\x00\x03\x04" "\x00\x00\x03\x08")); + /* bug #416 */ + YAZ_CHECK(tst_convert_l( + cd, + 0, + "\xEB\x74\xEC\x73", + 12, + "\x00\x00\x00\x74" "\x00\x00\x03\x61" "\x00\x00\x00\x73")); + /* bug #416 */ + YAZ_CHECK(tst_convert_l( + cd, + 0, + "\xFA\x74\xFB\x73", + 12, + "\x00\x00\x00\x74" "\x00\x00\x03\x60" "\x00\x00\x00\x73")); - while (inbytesleft) - { - size_t outbytesleft = outbuf0 + sizeof(outbuf0) - outbuf; - if (outbytesleft > 12) - outbytesleft = 12; - r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - if (r == (size_t) (-1)) - { - int e = yaz_iconv_error(cd); - YAZ_CHECK(e == YAZ_ICONV_E2BIG); - if (e != YAZ_ICONV_E2BIG) - return; - } - else - break; - } - ret = compare_buffers("tsticonv 22", i, - expect_len, ar[i].ucs4_b, - outbuf - outbuf0, outbuf0); - YAZ_CHECK(ret); - } yaz_iconv_close(cd); } static void tst_ucs4b_to_utf8() { - static const char *ucs4_c[] = { - "\x00\x00\xFF\x1F\x00\x00\x00o", - "\x00\x00\xAE\x0E\x00\x00\xC0\xF4", - 0 - }; - static const char *utf8_c[] = { - "\xEF\xBC\x9F\x6F", - "\xEA\xB8\x8E\xEC\x83\xB4", - 0 - }; - - int i; - int ret; - yaz_iconv_t cd; - - cd = yaz_iconv_open("UTF8", "UCS4"); + yaz_iconv_t cd = yaz_iconv_open("UTF8", "UCS4"); YAZ_CHECK(cd); if (!cd) return; - for (i = 0; ucs4_c[i]; i++) - { - size_t r; - char *inbuf= (char*) ucs4_c[i]; - size_t inbytesleft = 8; - char outbuf0[24]; - char *outbuf = outbuf0; - size_t outbytesleft = sizeof(outbuf0); - - r = yaz_iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - YAZ_CHECK(r != (size_t) (-1)); - if (r == (size_t) (-1)) - return; - ret = compare_buffers("tsticonv 32", i, - strlen(utf8_c[i]), utf8_c[i], - outbuf - outbuf0, outbuf0); - YAZ_CHECK(ret); - } + YAZ_CHECK(tst_convert_l( + cd, + 8, + "\x00\x00\xFF\x1F\x00\x00\x00o", + 4, + "\xEF\xBC\x9F\x6F")); + + YAZ_CHECK(tst_convert_l( + cd, + 8, + "\x00\x00\xAE\x0E\x00\x00\xC0\xF4", + 6, + "\xEA\xB8\x8E\xEC\x83\xB4")); yaz_iconv_close(cd); } @@ -349,37 +337,41 @@ int utf8_check(unsigned c) return 1; } -static int tst_convert(yaz_iconv_t cd, const char *buf, const char *cmpbuf) +static void tst_marc8_to_utf8() { - int ret = 0; - WRBUF b = wrbuf_alloc(); - char outbuf[12]; - size_t inbytesleft = strlen(buf); - const char *inp = buf; - while (inbytesleft) - { - size_t outbytesleft = sizeof(outbuf); - char *outp = outbuf; - size_t r = yaz_iconv(cd, (char**) &inp, &inbytesleft, - &outp, &outbytesleft); - if (r == (size_t) (-1)) - { - int e = yaz_iconv_error(cd); - if (e != YAZ_ICONV_E2BIG) - break; - } - wrbuf_write(b, outbuf, outp - outbuf); - } - if (wrbuf_len(b) == strlen(cmpbuf) - && !memcmp(cmpbuf, wrbuf_buf(b), wrbuf_len(b))) - ret = 1; - else - yaz_log(YLOG_LOG, "GOT (%.*s)", wrbuf_len(b), wrbuf_buf(b)); - wrbuf_free(b, 1); - return ret; + yaz_iconv_t cd = yaz_iconv_open("UTF-8", "MARC8"); + + YAZ_CHECK(cd); + if (!cd) + return; + + YAZ_CHECK(tst_convert(cd, "Cours de math", + "Cours de math")); + /* COMBINING ACUTE ACCENT */ + YAZ_CHECK(tst_convert(cd, "Cours de mathâe", + "Cours de mathe\xcc\x81")); + yaz_iconv_close(cd); +} + +static void tst_marc8s_to_utf8() +{ + yaz_iconv_t cd = yaz_iconv_open("UTF-8", "MARC8s"); + + YAZ_CHECK(cd); + if (!cd) + return; + + YAZ_CHECK(tst_convert(cd, "Cours de math", + "Cours de math")); + /* E9: LATIN SMALL LETTER E WITH ACUTE */ + YAZ_CHECK(tst_convert(cd, "Cours de mathâe", + "Cours de math\xc3\xa9")); + + yaz_iconv_close(cd); } -static void tst_conversion_marc8_to_latin1() + +static void tst_marc8_to_latin1() { yaz_iconv_t cd = yaz_iconv_open("ISO-8859-1", "MARC8"); @@ -387,6 +379,20 @@ static void tst_conversion_marc8_to_latin1() if (!cd) return; + YAZ_CHECK(tst_convert(cd, "ax", "ax")); + + /* latin capital letter o with stroke */ + YAZ_CHECK(tst_convert(cd, "\xa2", "\xd8")); + + /* with latin small letter ae */ + YAZ_CHECK(tst_convert(cd, "eneb\xb5r", "eneb\346r")); + + YAZ_CHECK(tst_convert(cd, "\xea" "a\xa2", "\xe5" "\xd8")); + + YAZ_CHECK(tst_convert(cd, "\xea" "a\xa2" "b", "\xe5" "\xd8" "b")); + + YAZ_CHECK(tst_convert(cd, "\xea" "a" "\xea" "a", "\xe5" "\xe5")); + YAZ_CHECK(tst_convert(cd, "Cours de math", "Cours de math")); YAZ_CHECK(tst_convert(cd, "Cours de mathâe", @@ -407,7 +413,7 @@ static void tst_conversion_marc8_to_latin1() yaz_iconv_close(cd); } -static void tst_conversion_utf8_to_marc8() +static void tst_utf8_to_marc8() { yaz_iconv_t cd = yaz_iconv_open("MARC8", "UTF-8"); @@ -455,7 +461,7 @@ static void tst_conversion_utf8_to_marc8() } -static void tst_conversion_latin1_to_marc8() +static void tst_latin1_to_marc8() { yaz_iconv_t cd = yaz_iconv_open("MARC8", "ISO-8859-1"); @@ -480,16 +486,8 @@ static void tst_conversion_latin1_to_marc8() yaz_iconv_close(cd); } -int main (int argc, char **argv) +static void tst_utf8_codes() { - YAZ_CHECK_INIT(argc, argv); - - tst_conversion_marc8_to_latin1(); - - tst_conversion_utf8_to_marc8(); - - tst_conversion_latin1_to_marc8(); - YAZ_CHECK(utf8_check(3)); YAZ_CHECK(utf8_check(127)); YAZ_CHECK(utf8_check(128)); @@ -502,15 +500,32 @@ int main (int argc, char **argv) YAZ_CHECK(utf8_check(1000000)); YAZ_CHECK(utf8_check(10000000)); YAZ_CHECK(utf8_check(100000000)); +} + +int main (int argc, char **argv) +{ + YAZ_CHECK_INIT(argc, argv); + + tst_utf8_codes(); + + tst_marc8_to_utf8(); + + tst_marc8s_to_utf8(); + + tst_marc8_to_latin1(); + + tst_utf8_to_marc8(); + + tst_latin1_to_marc8(); + + tst_marc8_to_ucs4b(); + tst_ucs4b_to_utf8(); dconvert(1, "UTF-8"); dconvert(1, "ISO-8859-1"); dconvert(1, "UCS4"); dconvert(1, "UCS4LE"); dconvert(0, "CP865"); - tst_marc8_to_iso_8859_1(); - tst_marc8_to_ucs4b(); - tst_ucs4b_to_utf8(); YAZ_CHECK_TERM; } -- 1.7.10.4