marc8 + ISO2709 checks for leader 9 YAZ-800
author Adam Dickmeiss <adam@indexdata.dk>
Fri, 14 Nov 2014 14:03:08 +0000 (15:03 +0100)
committer Adam Dickmeiss <adam@indexdata.dk>
Fri, 14 Nov 2014 14:05:25 +0000 (15:05 +0100)
include/yaz/marcdisp.h
src/marcdisp.c
src/opac_to_xml.c
src/record_conv.c
src/record_render.c
test/test_record_conv.c
util/marcdump.c

index d7bbaa0..fb0bab8 100644 (file)
@@ -484,6 +484,20 @@ struct json_node;
 
 YAZ_EXPORT int yaz_marc_read_json_node(yaz_marc_t mt, struct json_node *n);
 
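+/** \brief check if a MARC21 record labelled MARC-8 is really UTF-8 encoded
+    \param charset character set given by user
+    \param marc_buf ISO2709 buffer
+    \param sz size of ISO2709 buffer
+    \retval 1 record is probably UTF-8 (MARC-8 charset, leader position 9 is 'a')
+    \retval 0 record is not taken to be UTF-8
+*/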
+YAZ_EXPORT
+int yaz_marc_check_marc21_coding(const char *charset,
+                                 const char *marc_buf, int sz);
+
+/** \brief check if the MARC21 record inside an OPAC record is UTF-8 encoded
+    \param charset character set given by user
+    \param r OPAC record; its bibliographic record is inspected
+    \retval 1 bibliographic record is octet-aligned and probably UTF-8
+    \retval 0 no octet-aligned bibliographic record, or not UTF-8
+*/
+YAZ_EXPORT
+int yaz_opac_check_marc21_coding(const char *charset, Z_OPACRecord *r);
+
 YAZ_END_CDECL
 
 #endif
index 3271889..85acb07 100644 (file)
@@ -1465,6 +1465,16 @@ void yaz_marc_write_using_libxml2(yaz_marc_t mt, int enable)
     mt->write_using_libxml2 = enable;
 }
 
+/*
+ * Decide whether an ISO2709 record that the caller labelled as MARC-8
+ * should be treated as UTF-8 instead.
+ *
+ * charset   character set name supplied by the caller; may be NULL
+ * marc_buf  start of the ISO2709 record; may be NULL
+ * sz        number of bytes available in marc_buf
+ *
+ * Returns 1 when charset matches "MARC8" or "MARC8?", the buffer holds
+ * more than 25 bytes and byte 9 of the record (leader position 9) is 'a'.
+ * Returns 0 in every other case, including a NULL charset or buffer.
+ */
+int yaz_marc_check_marc21_coding(const char *charset,
+                                 const char *marc_buf, int sz)
+{
+    /* Nothing to inspect without a charset name or a record buffer.
+       The charset guard keeps a NULL name away from yaz_matchstr. */
+    if (!charset || !marc_buf)
+        return 0;
+
+    /* Only records announced as MARC-8 are second-guessed
+       (yaz_matchstr returns 0 on a match). */
+    if (yaz_matchstr(charset, "MARC8?") && yaz_matchstr(charset, "MARC8"))
+        return 0;
+
+    /* Require more than 25 bytes before looking at leader position 9. */
+    return sz > 25 && marc_buf[9] == 'a';
+}
+
 /*
  * Local variables:
  * c-basic-offset: 4
index 1f0c9a3..f365082 100644 (file)
@@ -191,6 +191,22 @@ void yaz_opac_decode_wrbuf(yaz_marc_t mt, Z_OPACRecord *r, WRBUF wrbuf)
     yaz_opac_decode_wrbuf2(mt, r, wrbuf, 0);
 }
 
+/*
+ * Apply yaz_marc_check_marc21_coding to the bibliographic part of an
+ * OPAC record.
+ *
+ * Returns 0 when the OPAC record has no bibliographic record or when that
+ * record is not carried as an octet-aligned external; otherwise returns
+ * whatever yaz_marc_check_marc21_coding reports for its bytes.
+ */
+int yaz_opac_check_marc21_coding(const char *charset, Z_OPACRecord *r)
+{
+    Z_External *bib = r->bibliographicRecord;
+
+    /* Only an octet-aligned bibliographic record can be inspected. */
+    if (!bib || bib->which != Z_External_octet)
+        return 0;
+
+    return yaz_marc_check_marc21_coding(
+        charset,
+        (const char *) bib->u.octet_aligned->buf,
+        bib->u.octet_aligned->len);
+}
+
 /*
  * Local variables:
  * c-basic-offset: 4
index e67ef00..9ab5b71 100644 (file)
@@ -519,23 +519,25 @@ static void *construct_marc(const xmlNode *ptr,
 static int convert_marc(void *info, WRBUF record, WRBUF wr_error)
 {
     struct marc_info *mi = info;
+    const char *input_charset = mi->input_charset;
     int ret = 0;
-
-    yaz_iconv_t cd = yaz_iconv_open(mi->output_charset, mi->input_charset);
     yaz_marc_t mt = yaz_marc_create();
 
     yaz_marc_xml(mt, mi->output_format_mode);
     if (mi->leader_spec)
         yaz_marc_leader_spec(mt, mi->leader_spec);
 
-    if (cd)
-        yaz_marc_iconv(mt, cd);
     if (mi->input_format_mode == YAZ_MARC_ISO2709)
     {
         int sz = yaz_marc_read_iso2709(mt, wrbuf_buf(record),
                                        wrbuf_len(record));
         if (sz > 0)
+        {
+            if (yaz_marc_check_marc21_coding(input_charset, wrbuf_buf(record),
+                                             wrbuf_len(record)))
+                input_charset = "utf-8";
             ret = 0;
+        }
         else
             ret = -1;
     }
@@ -564,13 +566,18 @@ static int convert_marc(void *info, WRBUF record, WRBUF wr_error)
     }
     if (ret == 0)
     {
+        yaz_iconv_t cd = yaz_iconv_open(mi->output_charset, input_charset);
+
+        if (cd)
+            yaz_marc_iconv(mt, cd);
+
         wrbuf_rewind(record);
         ret = yaz_marc_write_mode(mt, record);
         if (ret)
             wrbuf_printf(wr_error, "yaz_marc_write_mode failed");
+        if (cd)
+            yaz_iconv_close(cd);
     }
-    if (cd)
-        yaz_iconv_close(cd);
     yaz_marc_destroy(mt);
     return ret;
 }
@@ -680,11 +687,15 @@ int yaz_record_conv_opac_record(yaz_record_conv_t p,
     else
     {
         struct marc_info *mi = r->info;
+        const char *input_charset = mi->input_charset;
+        yaz_iconv_t cd;
 
         WRBUF res = wrbuf_alloc();
         yaz_marc_t mt = yaz_marc_create();
-        yaz_iconv_t cd = yaz_iconv_open(mi->output_charset,
-                                        mi->input_charset);
+
+        if (yaz_opac_check_marc21_coding(input_charset, input_record))
+            input_charset = "utf-8";
+        cd = yaz_iconv_open(mi->output_charset, input_charset);
 
         wrbuf_rewind(p->wr_error);
         yaz_marc_xml(mt, mi->output_format_mode);
index 1291858..82fa698 100644 (file)
@@ -28,7 +28,9 @@
 #endif
 
 static yaz_iconv_t iconv_create_charset(const char *record_charset,
-                                        yaz_iconv_t *cd2)
+                                        yaz_iconv_t *cd2,
+                                        const char *marc_buf,
+                                        int sz)
 {
     char charset_buf[40];
     yaz_iconv_t cd = 0;
@@ -62,7 +64,11 @@ static yaz_iconv_t iconv_create_charset(const char *record_charset,
     }
 
     if (from_set1)
+    {
+        if (yaz_marc_check_marc21_coding(from_set1, marc_buf, sz))
+            from_set1 = "utf-8";
         cd = yaz_iconv_open(to_set, from_set1);
+    }
     if (cd2)
     {
         if (from_set2)
@@ -79,7 +85,7 @@ static const char *return_marc_record(WRBUF wrbuf,
                                       const char *buf, int sz,
                                       const char *record_charset)
 {
-    yaz_iconv_t cd = iconv_create_charset(record_charset, 0);
+    yaz_iconv_t cd = iconv_create_charset(record_charset, 0, buf, sz);
     yaz_marc_t mt = yaz_marc_create();
     const char *ret_string = 0;
 
@@ -103,10 +109,22 @@ static const char *return_opac_record(WRBUF wrbuf,
                                       Z_OPACRecord *opac_rec,
                                       const char *record_charset)
 {
-    yaz_iconv_t cd2;
-    yaz_iconv_t cd = iconv_create_charset(record_charset, &cd2);
+    yaz_iconv_t cd, cd2;
+    const char *marc_buf = 0;
+    int marc_sz = 0;
     yaz_marc_t mt = yaz_marc_create();
 
+    if (opac_rec->bibliographicRecord)
+    {
+        Z_External *ext = opac_rec->bibliographicRecord;
+        if (ext->which == Z_External_octet)
+        {
+            marc_buf = (const char *) ext->u.octet_aligned->buf;
+            marc_sz = ext->u.octet_aligned->len;
+        }
+    }
+    cd = iconv_create_charset(record_charset, &cd2, marc_buf, marc_sz);
+
     if (cd)
         yaz_marc_iconv(mt, cd);
     yaz_marc_xml(mt, marc_type);
@@ -131,7 +149,7 @@ static const char *return_string_record(WRBUF wrbuf,
                                         const char *buf, int sz,
                                         const char *record_charset)
 {
-    yaz_iconv_t cd = iconv_create_charset(record_charset, 0);
+    yaz_iconv_t cd = iconv_create_charset(record_charset, 0, 0, 0);
 
     if (cd)
     {
index f388ea4..ff9a2a8 100644 (file)
@@ -369,7 +369,7 @@ static void tst_convert3(void)
     yaz_record_conv_t p = 0;
 
     const char *iso2709_rec =
-        "\x30\x30\x30\x37\x37\x6E\x61\x6D\x20\x61\x32\x32\x30\x30\x30\x34"
+        "\x30\x30\x30\x37\x37\x6E\x61\x6D\x20\x20\x32\x32\x30\x30\x30\x34"
         "\x39\x38\x61\x20\x34\x35\x30\x30\x30\x30\x31\x30\x30\x31\x33\x30"
         "\x30\x30\x30\x30\x30\x31\x30\x30\x30\x31\x34\x30\x30\x30\x31\x33"
         "\x1E\x20\x20\x20\x31\x31\x32\x32\x34\x34\x36\x36\x20\x1E\x20\x20"
index 850331b..c45d146 100644 (file)
@@ -319,6 +319,7 @@ static void dump(const char *fname, const char *from, const char *to,
             size_t len_result;
             size_t r;
             char buf[100001];
+            yaz_iconv_t cd1 = 0;
 
             r = fread(buf, 1, 5, inf);
             if (r < 5)
@@ -428,7 +429,21 @@ static void dump(const char *fname, const char *from, const char *to,
                 }
             }
             len_result = rlen;
+
+            if (yaz_marc_check_marc21_coding(from, buf, 26))
+            {
+                cd1 = yaz_iconv_open(to, "utf-8");
+                if (cd1)
+                    yaz_marc_iconv(mt, cd);
+            }
             r = yaz_marc_decode_buf(mt, buf, -1, &result, &len_result);
+
+            if (cd1)
+            {
+                yaz_iconv_close(cd1);
+                yaz_marc_iconv(mt, cd);
+            }
+
             if (r == -1)
                 no_errors++;
             if (r > 0 && result && len_result)