* Copyright (c) 1997-2002, Index Data
* See the file LICENSE for details.
*
- * $Id: siconv.c,v 1.1 2002-08-27 14:02:13 adam Exp $
+ * $Id: siconv.c,v 1.2 2002-08-27 21:45:28 adam Exp $
*/
+/* mini iconv and wrapper for system iconv library (if present) */
+
#if HAVE_CONFIG_H
#include <config.h>
#endif
struct yaz_iconv_struct {
int my_errno;
- unsigned long (*read_handle)(yaz_iconv_t cd, char **inbuf,
- size_t *inbytesleft);
+ int init_flag; /* set to 1 when a descriptor is created and when the converter is reset with a null input buffer; cleared after the start-of-input check */
+ size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
+ size_t inbytesleft, size_t *no_read); /* optional (may be 0): examines the start of the input and stores the number of bytes to skip in *no_read */
+ unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
+ size_t inbytesleft, size_t *no_read); /* decodes one character; *no_read receives the bytes consumed, 0 signals failure */
size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
char **outbuf, size_t *outbytesleft);
#if HAVE_ICONV_H
#endif
};
-
-static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd,
- char **inbuf, size_t *inbytesleft)
+static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
+ size_t inbytesleft, size_t *no_read) /* read handler: returns the first input byte as the character value; cd and inbytesleft are not used */
{
- unsigned char *inp = *inbuf;
- unsigned long x = 0;
- x = inp[0];
- (*inbytesleft)--;
- inp++;
- *inbuf = inp;
+ unsigned long x = inp[0]; /* the byte value is passed through unchanged */
+ *no_read = 1; /* always consumes exactly one byte; the caller advances the input */
return x;
}
-static unsigned long yaz_read_UTF8 (yaz_iconv_t cd,
- char **inbuf, size_t *inbytesleft)
+/* Start-of-input handler for UTF-8: detects an optional byte order mark.
+ *
+ * inp / inbytesleft describe the first bytes of the input.  On success
+ * the function returns 0 and stores in *no_read the number of bytes the
+ * caller should skip: 3 when the input starts with EF BB BF, else 0.
+ * When the input is a proper prefix of the mark (EF or EF BB only), the
+ * function sets cd->my_errno to YAZ_ICONV_EINVAL and returns (size_t) -1
+ * so the caller can supply more bytes.
+ *
+ * A leading 0xef that is not followed by BB BF is NOT an error here:
+ * 0xef also starts ordinary three-byte characters, so such input is
+ * left for the read handler to decode (or reject).
+ */
+static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
+ size_t inbytesleft, size_t *no_read)
+{
+ *no_read = 0;
+ /* empty input must not be dereferenced; anything not starting with
+ 0xef cannot be a mark */
+ if (inbytesleft == 0 || inp[0] != 0xef)
+ return 0;
+ if (inbytesleft >= 2 && inp[1] != 0xbb)
+ return 0;
+ if (inbytesleft < 3)
+ {
+ cd->my_errno = YAZ_ICONV_EINVAL; /* cannot decide yet */
+ return (size_t) -1;
+ }
+ if (inp[2] != 0xbf)
+ return 0;
+ *no_read = 3;
+ return 0;
+}
+
+/* Read handler for UTF-8: decodes one sequence of 1 to 6 bytes.
+ *
+ * Returns the decoded value and stores the sequence length in *no_read.
+ * On failure *no_read is 0 and cd->my_errno is set:
+ * YAZ_ICONV_EILSEQ - first byte cannot start a sequence (0x80..0xbf,
+ * 0xfe, 0xff) or the value is encoded with more bytes than needed;
+ * YAZ_ICONV_EINVAL - fewer than the required bytes are available.
+ * The caller must pass at least one byte (inp[0] is read unconditionally).
+ * NOTE(review): continuation bytes are masked with 0x3f but their top
+ * two bits are not verified.
+ */
+static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
+ size_t inbytesleft, size_t *no_read)
{
- unsigned char *inp = *inbuf;
unsigned long x = 0;
+
if (inp[0] <= 0x7f)
{
x = inp[0];
-
- (*inbytesleft)--;
- inp++;
+ *no_read = 1;
}
- else if (inp[0] <= 0xdf && *inbytesleft >= 2)
+ else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
{
- x = ((inp[0] & 0x1f) << 6) + (inp[1] & 0x3f);
-
- (*inbytesleft) -= 2;
- inp += 2;
+ *no_read = 0;
+ cd->my_errno = YAZ_ICONV_EILSEQ;
}
- else if (inp[0] <= 0xef && *inbytesleft >= 3)
+ else if (inp[0] <= 0xdf && inbytesleft >= 2)
{
- x = ((inp[0] & 0x0f) << 12) +
- ((inp[1] & 0x3f) << 6) + (inp[1] & 0x3f);
-
- (*inbytesleft) -= 3;
- inp += 3;
+ x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
+ if (x >= 0x80)
+ *no_read = 2;
+ else
+ {
+ *no_read = 0;
+ cd->my_errno = YAZ_ICONV_EILSEQ;
+ }
}
- else if (inp[0] <= 0xef && *inbytesleft >= 4)
+ else if (inp[0] <= 0xef && inbytesleft >= 3)
{
- x = ((inp[0] & 0x07) << 18) +
- ((inp[1] & 0x3f) << 12) + ((inp[2] & 0x3f) << 6) +
- (inp[3] & 0x3f);
-
- (*inbytesleft) -= 4;
- inp += 4;
+ /* low six bits come from the THIRD byte, inp[2] */
+ x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
+ (inp[2] & 0x3f);
+ if (x >= 0x800)
+ *no_read = 3;
+ else
+ {
+ *no_read = 0;
+ cd->my_errno = YAZ_ICONV_EILSEQ;
+ }
+ }
+ else if (inp[0] <= 0xf7 && inbytesleft >= 4)
+ {
+ x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
+ ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
+ if (x >= 0x10000)
+ *no_read = 4;
+ else
+ {
+ *no_read = 0;
+ cd->my_errno = YAZ_ICONV_EILSEQ;
+ }
+ }
+ else if (inp[0] <= 0xfb && inbytesleft >= 5)
+ {
+ x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
+ ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
+ (inp[4] & 0x3f);
+ if (x >= 0x200000)
+ *no_read = 5;
+ else
+ {
+ *no_read = 0;
+ cd->my_errno = YAZ_ICONV_EILSEQ;
+ }
+ }
+ else if (inp[0] <= 0xfd && inbytesleft >= 6)
+ {
+ x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
+ ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
+ ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
+ if (x >= 0x4000000)
+ *no_read = 6;
+ else
+ {
+ *no_read = 0;
+ cd->my_errno = YAZ_ICONV_EILSEQ;
+ }
}
else
{
+ *no_read = 0;
cd->my_errno = YAZ_ICONV_EINVAL;
}
- *inbuf = inp;
return x;
}
-static unsigned long yaz_read_UCS4 (yaz_iconv_t cd,
- char **inbuf, size_t *inbytesleft)
+/* Read handler for UCS-4 with the most significant byte first.
+ *
+ * Combines inp[0..3] into one value and stores 4 in *no_read.  With
+ * fewer than 4 bytes available it sets cd->my_errno to YAZ_ICONV_EINVAL,
+ * stores 0 in *no_read and returns 0.
+ */
+static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
+ size_t inbytesleft, size_t *no_read)
{
- unsigned char *inp = *inbuf;
unsigned long x = 0;
- if (*inbytesleft < 4)
+ if (inbytesleft < 4)
{
cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
- return 0;
+ *no_read = 0;
+ }
+ else
+ {
+ /* widen before shifting: in int, a byte >= 0x80 shifted by 24
+ would reach the sign bit and sign-extend into x */
+ x = ((unsigned long) inp[0]<<24) | ((unsigned long) inp[1]<<16) |
+ ((unsigned long) inp[2]<<8) | inp[3];
+ *no_read = 4;
+ }
+ return x;
+}
+
+/* Read handler for UCS-4 with the least significant byte first.
+ *
+ * Combines inp[0..3] (inp[0] is the low byte) into one value and stores
+ * 4 in *no_read.  With fewer than 4 bytes available it sets
+ * cd->my_errno to YAZ_ICONV_EINVAL, stores 0 in *no_read and returns 0.
+ */
+static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
+ size_t inbytesleft, size_t *no_read)
+{
+ unsigned long x = 0;
+
+ if (inbytesleft < 4)
+ {
+ cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
+ *no_read = 0;
+ }
+ else
+ {
+ /* widen before shifting: in int, a byte >= 0x80 shifted by 24
+ would reach the sign bit and sign-extend into x */
+ x = ((unsigned long) inp[3]<<24) | ((unsigned long) inp[2]<<16) |
+ ((unsigned long) inp[1]<<8) | inp[0];
+ *no_read = 4;
}
- memcpy (&x, inp, sizeof(x));
- (*inbytesleft) -= 4;
- inp += 4;
- *inbuf = inp;
return x;
}
{
*outp++ = (x >> 18) | 0xf0;
*outp++ = ((x >> 12) & 0x3f) | 0x80;
- *outp++ = ((x >> 6) & 0x3f) | 0x80;
+ *outp++ = ((x >> 6) & 0x3f) | 0x80;
*outp++ = (x & 0x3f) | 0x80;
(*outbytesleft) -= 4;
}
- else if (x > 0x1fffff)
+ else if (x <= 0x3ffffff && *outbytesleft >= 5)
{
- cd->my_errno = YAZ_ICONV_EILSEQ; /* invalid sequence */
- return (size_t)(-1);
+ *outp++ = (x >> 24) | 0xf8;
+ *outp++ = ((x >> 18) & 0x3f) | 0x80;
+ *outp++ = ((x >> 12) & 0x3f) | 0x80;
+ *outp++ = ((x >> 6) & 0x3f) | 0x80;
+ *outp++ = (x & 0x3f) | 0x80;
+ (*outbytesleft) -= 5;
+ }
+ else if (*outbytesleft >= 6)
+ {
+ *outp++ = (x >> 30) | 0xfc;
+ *outp++ = ((x >> 24) & 0x3f) | 0x80;
+ *outp++ = ((x >> 18) & 0x3f) | 0x80;
+ *outp++ = ((x >> 12) & 0x3f) | 0x80;
+ *outp++ = ((x >> 6) & 0x3f) | 0x80;
+ *outp++ = (x & 0x3f) | 0x80;
+ (*outbytesleft) -= 6;
}
else
{
char **outbuf, size_t *outbytesleft)
{
unsigned char *outp = *outbuf;
+ /* Emits x as four bytes, most significant first.  Needs 4 bytes of
+ output space; otherwise sets YAZ_ICONV_E2BIG and returns (size_t)(-1)
+ without touching *outbuf.  Returns 0 on success after advancing
+ *outbuf and reducing *outbytesleft by 4. */
- if (x < 1 || x > 0x1fffff)
+ if (*outbytesleft >= 4)
{
- cd->my_errno = YAZ_ICONV_EILSEQ;
+ /* shift RIGHT to bring each byte into the low 8 bits; a left
+ shift would always store 0 here */
+ *outp++ = x>>24;
+ *outp++ = x>>16;
+ *outp++ = x>>8;
+ *outp++ = x;
+ (*outbytesleft) -= 4;
+ }
+ else
+ {
+ cd->my_errno = YAZ_ICONV_E2BIG;
return (size_t)(-1);
}
- else if (*outbytesleft >= 4)
+ *outbuf = outp;
+ return 0;
+}
+
+/* Write handler for UCS-4 with the least significant byte first.
+ * When at least 4 bytes of output space remain, stores the four bytes
+ * of x (low byte first) and reduces *outbytesleft by 4.
+ */
+static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
+ char **outbuf, size_t *outbytesleft)
+{
+ unsigned char *outp = *outbuf;
+ if (*outbytesleft >= 4)
{
- memcpy (outp, &x, sizeof(x));
- outp += 4;
+ /* shift RIGHT to bring each byte into the low 8 bits; a left
+ shift would always store 0 for the upper three bytes */
+ *outp++ = x;
+ *outp++ = x>>8;
+ *outp++ = x>>16;
+ *outp++ = x>>24;
(*outbytesleft) -= 4;
}
else
cd->write_handle = 0;
cd->read_handle = 0;
+ cd->init_handle = 0;
cd->my_errno = YAZ_ICONV_UNKNOWN;
- if (!strcmp(fromcode, "UTF-8"))
+ if (!yaz_matchstr(fromcode, "UTF8"))
+ {
cd->read_handle = yaz_read_UTF8;
- else if (!strcmp(fromcode, "ISO-8859-1"))
+ cd->init_handle = yaz_init_UTF8;
+ }
+ else if (!yaz_matchstr(fromcode, "ISO88591"))
cd->read_handle = yaz_read_ISO8859_1;
- else if (!strcmp(fromcode, "UCS-4"))
+ else if (!yaz_matchstr(fromcode, "UCS4"))
cd->read_handle = yaz_read_UCS4;
-
-
- if (!strcmp(tocode, "UTF-8"))
+ else if (!yaz_matchstr(fromcode, "UCS4LE"))
+ cd->read_handle = yaz_read_UCS4LE;
+
+ if (!yaz_matchstr(tocode, "UTF8"))
cd->write_handle = yaz_write_UTF8;
- else if (!strcmp (tocode, "ISO-8859-1"))
+ else if (!yaz_matchstr(tocode, "ISO88591"))
cd->write_handle = yaz_write_ISO8859_1;
- else if (!strcmp (tocode, "UCS-4"))
+ else if (!yaz_matchstr (tocode, "UCS4"))
cd->write_handle = yaz_write_UCS4;
+ else if (!yaz_matchstr(tocode, "UCS4LE"))
+ cd->write_handle = yaz_write_UCS4LE;
#if HAVE_ICONV_H
cd->iconv_cd = 0;
}
}
#else
- if (!cd->to_UCS4 || !cd->from_UCS4)
+ if (!cd->read_handle || !cd->write_handle)
{
xfree (cd);
return 0;
}
#endif
+ cd->init_flag = 1;
return cd;
}
}
#endif
if (inbuf == 0 || *inbuf == 0)
+ {
+ cd->init_flag = 1;
+ cd->my_errno = YAZ_ICONV_UNKNOWN;
return 0;
+ }
inbuf0 = *inbuf;
+
+ /* First call after open/reset: let the source encoding examine the
+ start of the input (if it installed an init handler). */
+ if (cd->init_flag)
+ {
+ if (cd->init_handle)
+ {
+ size_t no_read;
+ size_t r = (cd->init_handle)(cd, *inbuf, *inbytesleft, &no_read);
+ if (r)
+ {
+ /* incomplete start: keep init_flag set so the check is
+ repeated when the caller supplies more input */
+ if (cd->my_errno == YAZ_ICONV_EINVAL)
+ return r;
+ cd->init_flag = 0;
+ if (cd->my_errno == YAZ_ICONV_EILSEQ)
+ {
+ /* step over the offending byte: one byte fewer left,
+ buffer one byte further (parenthesised so the COUNT
+ changes, not the local pointer) */
+ (*inbytesleft)--;
+ (*inbuf)++;
+ }
+ return r;
+ }
+ *inbytesleft -= no_read;
+ *inbuf += no_read;
+ }
+ cd->init_flag = 0;
+ }
while (1)
{
unsigned long x;
+ size_t no_read;
if (*inbytesleft == 0)
{
break;
}
- x = (cd->read_handle)(cd, inbuf, inbytesleft);
- if (x == 0)
+ x = (cd->read_handle)(cd, *inbuf, *inbytesleft, &no_read);
+ if (no_read == 0)
{
r = (size_t)(-1);
break;
r = (cd->write_handle)(cd, x, outbuf, outbytesleft);
if (r)
break;
+ *inbytesleft -= no_read;
+ (*inbuf) += no_read;
}
return r;
}
* Copyright (c) 1997-2002, Index Data
* See the file LICENSE for details.
*
- * $Id: siconvtst.c,v 1.2 2002-08-27 14:14:01 adam Exp $
+ * $Id: siconvtst.c,v 1.3 2002-08-27 21:45:28 adam Exp $
*/
#if HAVE_CONFIG_H
#define CHUNK 8
-static void convert (FILE *inf, yaz_iconv_t cd)
+void convert (FILE *inf, yaz_iconv_t cd)
{
char inbuf0[CHUNK], *inbuf = inbuf0;
char outbuf0[CHUNK], *outbuf = outbuf0;
size_t outbytesleft = CHUNK;
size_t inbytesleft = CHUNK;
+ int mustread = 1;
while (1)
{
- size_t r = fread (inbuf, 1, inbytesleft, inf);
- if (inbytesleft != r)
+ size_t r;
+ if (mustread)
{
- if (ferror(inf))
+ r = fread (inbuf, 1, inbytesleft, inf);
+ if (inbytesleft != r)
{
- fprintf (stderr, "yaziconv: error reading file\n");
- exit (6);
- }
- if (r == 0)
- {
- if (outbuf != outbuf0)
- fwrite (outbuf0, 1, outbuf - outbuf0, stdout);
- break;
+ if (ferror(inf))
+ {
+ fprintf (stderr, "yaziconv: error reading file\n");
+ exit (6);
+ }
+ if (r == 0)
+ {
+ if (outbuf != outbuf0)
+ fwrite (outbuf0, 1, outbuf - outbuf0, stdout);
+ break;
+ }
+ inbytesleft = r;
}
}
r = yaz_iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
if (r == (size_t)(-1))
{
- if (yaz_iconv_error(cd) == YAZ_ICONV_EILSEQ)
+ int e = yaz_iconv_error(cd);
+ if (e == YAZ_ICONV_EILSEQ)
{
fprintf (stderr, "invalid sequence\n");
return ;
}
-
- if (yaz_iconv_error(cd) == EINVAL) /* incomplete input */
+ else if (e == YAZ_ICONV_EINVAL) /* incomplete input */
{
size_t i;
for (i = 0; i<inbytesleft; i++)
inbuf0[i] = inbuf[i];
+ inbuf = inbuf0 + i;
inbytesleft = CHUNK - inbytesleft;
+ mustread = 1;
}
- if (yaz_iconv_error(cd) == E2BIG) /* no more output space */
+ else if (e == YAZ_ICONV_E2BIG) /* no more output space */
{
fwrite (outbuf0, 1, outbuf - outbuf0, stdout);
outbuf = outbuf0;
outbytesleft = CHUNK;
+ mustread = 0;
}
else
{
{
inbuf = inbuf0;
inbytesleft = CHUNK;
+
+ fwrite (outbuf0, 1, outbuf - outbuf0, stdout);
+ outbuf = outbuf0;
+ outbytesleft = CHUNK;
+
+ mustread = 1;
}
}
}