From 7dec30565506b5ecdd449866ebabe67bd816fc59 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Tue, 27 Aug 2002 14:02:13 +0000 Subject: [PATCH] Simple iconv library --- include/yaz/yaz-util.h | 20 ++- util/Makefile.am | 10 +- util/siconv.c | 315 ++++++++++++++++++++++++++++++++++++++++++++++++ util/siconvtst.c | 132 ++++++++++++++++++++ 4 files changed, 472 insertions(+), 5 deletions(-) create mode 100644 util/siconv.c create mode 100644 util/siconvtst.c diff --git a/include/yaz/yaz-util.h b/include/yaz/yaz-util.h index 5030bff..0155c0b 100644 --- a/include/yaz/yaz-util.h +++ b/include/yaz/yaz-util.h @@ -4,7 +4,10 @@ * Sebastian Hammer, Adam Dickmeiss * * $Log: yaz-util.h,v $ - * Revision 1.3 2001-04-06 12:26:46 adam + * Revision 1.4 2002-08-27 14:02:43 adam + * Simple iconv library + * + * Revision 1.3 2001/04/06 12:26:46 adam * Optional CCL module. Moved atoi_n to marcdisp.h from yaz-util.h. * * Revision 1.2 2000/02/28 11:20:06 adam @@ -48,8 +51,23 @@ YAZ_BEGIN_CDECL +typedef struct yaz_iconv_struct *yaz_iconv_t; +#define YAZ_ICONV_UNKNOWN 1 +#define YAZ_ICONV_E2BIG 2 +#define YAZ_ICONV_EILSEQ 3 +#define YAZ_ICONV_EINVAL 4 + +YAZ_EXPORT yaz_iconv_t yaz_iconv_open (const char *tocode, + const char *fromcode); +YAZ_EXPORT size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, + char **outbuf, size_t *outbytesleft); +YAZ_EXPORT int yaz_iconv_error (yaz_iconv_t cd); + +YAZ_EXPORT int yaz_iconv_close (yaz_iconv_t cd); + YAZ_EXPORT int yaz_matchstr(const char *s1, const char *s2); + YAZ_END_CDECL #endif diff --git a/util/Makefile.am b/util/Makefile.am index 5d90821..33cab0c 100644 --- a/util/Makefile.am +++ b/util/Makefile.am @@ -1,6 +1,6 @@ ## Copyright (C) 1994-2001, Index Data ## All rights reserved. -## $Id: Makefile.am,v 1.11 2002-04-15 09:44:44 adam Exp $ +## $Id: Makefile.am,v 1.12 2002-08-27 14:02:13 adam Exp $ noinst_LTLIBRARIES = libutil.la @@ -14,13 +14,15 @@ LIBS = AM_CPPFLAGS=-I$(top_srcdir)/include -noinst_PROGRAMS = marcdump +noinst_PROGRAMS = marcdump yaziconv marcdump_LDADD = libutil.la - marcdump_SOURCES = marcdump.c +yaziconv_LDADD = libutil.la +yaziconv_SOURCES = siconvtst.c + libutil_la_SOURCES=options.c log.c marcdisp.c oid.c wrbuf.c nmemsdup.c \ - xmalloc.c readconf.c tpath.c nmem.c matchstr.c atoin.c + xmalloc.c readconf.c tpath.c nmem.c matchstr.c atoin.c siconv.c #libyazthread_la_SOURCES=nmemthread.c xmalloc.c log.c diff --git a/util/siconv.c b/util/siconv.c new file mode 100644 index 0000000..a01b103 --- /dev/null +++ b/util/siconv.c @@ -0,0 +1,315 @@ +/* + * Copyright (c) 1997-2002, Index Data + * See the file LICENSE for details. + * + * $Id: siconv.c,v 1.1 2002-08-27 14:02:13 adam Exp $ + */ + +#if HAVE_CONFIG_H +#include +#endif + +#include +#include +#include + +#if HAVE_ICONV_H +#include +#endif + +#include + +struct yaz_iconv_struct { + int my_errno; + unsigned long (*read_handle)(yaz_iconv_t cd, char **inbuf, + size_t *inbytesleft); + size_t (*write_handle)(yaz_iconv_t cd, unsigned long x, + char **outbuf, size_t *outbytesleft); +#if HAVE_ICONV_H + iconv_t iconv_cd; +#endif +}; + + +static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, + char **inbuf, size_t *inbytesleft) +{ + unsigned char *inp = *inbuf; + unsigned long x = 0; + x = inp[0]; + (*inbytesleft)--; + inp++; + *inbuf = inp; + return x; +} + +static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, + char **inbuf, size_t *inbytesleft) +{ + unsigned char *inp = *inbuf; + unsigned long x = 0; + if (inp[0] <= 0x7f) + { + x = inp[0]; + + (*inbytesleft)--; + inp++; + } + else if (inp[0] <= 0xdf && *inbytesleft >= 2) + { + x = ((inp[0] & 0x1f) << 6) + (inp[1] & 0x3f); + + (*inbytesleft) -= 2; + inp += 2; + } + else if (inp[0] <= 0xef && *inbytesleft >= 3) + { + x = ((inp[0] & 0x0f) << 12) + + ((inp[1] & 0x3f) << 6) + (inp[1] & 0x3f); + + (*inbytesleft) -= 3; + inp += 3; + } + else if (inp[0] <= 0xef && *inbytesleft >= 4) + { + x = ((inp[0] & 0x07) << 18) + + ((inp[1] & 0x3f) << 12) + ((inp[2] & 0x3f) << 6) + + (inp[3] & 0x3f); + + (*inbytesleft) -= 4; + inp += 4; + } + else + { + cd->my_errno = YAZ_ICONV_EINVAL; + } + *inbuf = inp; + return x; +} + +static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, + char **inbuf, size_t *inbytesleft) +{ + unsigned char *inp = *inbuf; + unsigned long x = 0; + + if (*inbytesleft < 4) + { + cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */ + return 0; + } + memcpy (&x, inp, sizeof(x)); + (*inbytesleft) -= 4; + inp += 4; + *inbuf = inp; + return x; +} + +static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x, + char **outbuf, size_t *outbytesleft) +{ + unsigned char *outp = *outbuf; + if (x <= 0x7f && *outbytesleft >= 1) + { + *outp++ = x; + (*outbytesleft)--; + } + else if (x <= 0x7ff && *outbytesleft >= 2) + { + *outp++ = (x >> 6) | 0xc0; + *outp++ = (x & 0x3f) | 0x80; + (*outbytesleft) -= 2; + } + else if (x <= 0xffff && *outbytesleft >= 3) + { + *outp++ = (x >> 12) | 0xe0; + *outp++ = ((x >> 6) & 0x3f) | 0x80; + *outp++ = (x & 0x3f) | 0x80; + (*outbytesleft) -= 3; + } + else if (x <= 0x1fffff && *outbytesleft >= 4) + { + *outp++ = (x >> 18) | 0xf0; + *outp++ = ((x >> 12) & 0x3f) | 0x80; + *outp++ = ((x >> 6) & 0x3f) | 0x80; + *outp++ = (x & 0x3f) | 0x80; + (*outbytesleft) -= 4; + } + else if (x > 0x1fffff) + { + cd->my_errno = YAZ_ICONV_EILSEQ; /* invalid sequence */ + return (size_t)(-1); + } + else + { + cd->my_errno = YAZ_ICONV_E2BIG; /* not room for output */ + return (size_t)(-1); + } + *outbuf = outp; + return 0; +} + +static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x, + char **outbuf, size_t *outbytesleft) +{ + unsigned char *outp = *outbuf; + if (x > 255 || x < 1) + { + cd->my_errno = YAZ_ICONV_EILSEQ; + return (size_t) -1; + } + else if (*outbytesleft >= 1) + { + *outp++ = x; + (*outbytesleft)--; + } + else + { + cd->my_errno = YAZ_ICONV_E2BIG; + return (size_t)(-1); + } + *outbuf = outp; + return 0; +} + + +static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x, + char **outbuf, size_t *outbytesleft) +{ + unsigned char *outp = *outbuf; + if (x < 1 || x > 0x1fffff) + { + cd->my_errno = YAZ_ICONV_EILSEQ; + return (size_t)(-1); + } + else if (*outbytesleft >= 4) + { + memcpy (outp, &x, sizeof(x)); + outp += 4; + (*outbytesleft) -= 4; + } + else + { + cd->my_errno = YAZ_ICONV_E2BIG; + return (size_t)(-1); + } + *outbuf = outp; + return 0; +} + +yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode) +{ + yaz_iconv_t cd = xmalloc (sizeof(*cd)); + + cd->write_handle = 0; + cd->read_handle = 0; + cd->my_errno = YAZ_ICONV_UNKNOWN; + + if (!strcmp(fromcode, "UTF-8")) + cd->read_handle = yaz_read_UTF8; + else if (!strcmp(fromcode, "ISO-8859-1")) + cd->read_handle = yaz_read_ISO8859_1; + else if (!strcmp(fromcode, "UCS-4")) + cd->read_handle = yaz_read_UCS4; + + + if (!strcmp(tocode, "UTF-8")) + cd->write_handle = yaz_write_UTF8; + else if (!strcmp (tocode, "ISO-8859-1")) + cd->write_handle = yaz_write_ISO8859_1; + else if (!strcmp (tocode, "UCS-4")) + cd->write_handle = yaz_write_UCS4; + +#if HAVE_ICONV_H + cd->iconv_cd = 0; + if (!cd->read_handle || !cd->write_handle) + { + cd->iconv_cd = iconv_open (tocode, fromcode); + if (cd->iconv_cd == (iconv_t) (-1)) + { + xfree (cd); + return 0; + } + } +#else + if (!cd->to_UCS4 || !cd->from_UCS4) + { + xfree (cd); + return 0; + } +#endif + return cd; +} + +size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft, + char **outbuf, size_t *outbytesleft) +{ + char *inbuf0; + size_t r = 0; +#if HAVE_ICONV_H + if (cd->iconv_cd) + { + size_t r = + iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft); + if (r == (size_t)(-1)) + { + switch (errno) + { + case E2BIG: + cd->my_errno = YAZ_ICONV_E2BIG; + break; + case EINVAL: + cd->my_errno = YAZ_ICONV_EINVAL; + break; + case EILSEQ: + cd->my_errno = YAZ_ICONV_EILSEQ; + break; + default: + cd->my_errno = YAZ_ICONV_UNKNOWN; + } + } + return r; + } +#endif + if (inbuf == 0 || *inbuf == 0) + return 0; + inbuf0 = *inbuf; + while (1) + { + unsigned long x; + + if (*inbytesleft == 0) + { + r = *inbuf - inbuf0; + break; + } + + x = (cd->read_handle)(cd, inbuf, inbytesleft); + if (x == 0) + { + r = (size_t)(-1); + break; + } + r = (cd->write_handle)(cd, x, outbuf, outbytesleft); + if (r) + break; + } + return r; +} + +int yaz_iconv_error (yaz_iconv_t cd) +{ + return cd->my_errno; +} + +int yaz_iconv_close (yaz_iconv_t cd) +{ +#if HAVE_ICONV_H + if (cd->iconv_cd) + iconv_close (cd->iconv_cd); +#endif + xfree (cd); + return 0; +} + + diff --git a/util/siconvtst.c b/util/siconvtst.c new file mode 100644 index 0000000..c06befe --- /dev/null +++ b/util/siconvtst.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 1997-2002, Index Data + * See the file LICENSE for details. + * + * $Id: siconvtst.c,v 1.1 2002-08-27 14:02:13 adam Exp $ + */ + +#if HAVE_CONFIG_H +#include +#endif + +#include +#include +#include + +#include + +#define CHUNK 8 + +static void convert (FILE *inf, yaz_iconv_t cd) +{ + char inbuf0[CHUNK], *inbuf = inbuf0; + char outbuf0[CHUNK], *outbuf = outbuf0; + size_t outbytesleft = CHUNK; + size_t inbytesleft = CHUNK; + + while (1) + { + size_t r = fread (inbuf, 1, inbytesleft, inf); + if (inbytesleft != r) + { + if (ferror(inf)) + { + fprintf (stderr, "yaziconv: error reading file\n"); + exit (6); + } + if (r == 0) + { + if (outbuf != outbuf0) + fwrite (outbuf0, 1, outbuf - outbuf0, stdout); + break; + } + } + r = yaz_iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); + if (r == (size_t)(-1)) + { + if (yaz_iconv_error(cd) == YAZ_ICONV_EILSEQ) + { + fprintf (stderr, "invalid sequence\n"); + return ; + } + + if (yaz_iconv_error(cd) == EINVAL) /* incomplete input */ + { + size_t i; + for (i = 0; i