checked in very nice ICU normalization examples
author Marc Cromme <marc@indexdata.dk>
Fri, 11 May 2007 22:23:33 +0000 (22:23 +0000)
committer Marc Cromme <marc@indexdata.dk>
Fri, 11 May 2007 22:23:33 +0000 (22:23 +0000)
src/test_icu_I18N.c

index 5b707ac..13ed02e 100644 (file)
@@ -1,4 +1,4 @@
-/* $Id: test_icu_I18N.c,v 1.16 2007-05-11 08:27:29 marc Exp $
+/* $Id: test_icu_I18N.c,v 1.17 2007-05-11 22:23:33 marc Exp $
    Copyright (c) 2006-2007, Index Data.
 
    This file is part of Pazpar2.
@@ -314,33 +314,15 @@ void test_icu_I18N_sortmap(int argc, char **argv)
 
 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 
-void test_icu_I18N_transliterator(int argc, char **argv)
+int test_icu_transliterator(const char * rules8cstr,
+                            const char * src8cstr,
+                            const char * chk8cstr)
 {
+    int success = 0;
     
     UErrorCode status = U_ZERO_ERROR;
     UParseError parse_error[256];
 
-    const char * rules8cstr = "[:Punctuation:] Any-Remove";
-    //const char * rules8cstr = "[:Control:] Any-Remove";
-    //const char * rules8cstr = "[:Decimal_Number:] Any-Remove\n";
-    //const char * rules8cstr = "[:Final_Punctuation:] Any-Remove";
-    //const char * rules8cstr = "Lower; [:^Letter:] Remove";
-    //const char * rules8cstr = "[:^Number:] Remove";
-    //const char * rules8cstr = "Lower;[[:WhiteSpace:][:Punctuation:]] Remove";
-    //const char * rules8cstr = "NFD; [:Nonspacing Mark:] Remove; NFC";
-             
-             
-             
-   
-    const char * src8cstr = "Genesis 1\n"
-        "The Beginning\n"
-        "1 In the beginning God created the heavens and the earth.\n"
-        "2 Now the earth was formless and empty, darkness was over "
-        "the surface of the deep, and the Spirit of God was hovering "
-        "over the waters.\n"
-        "3 And God said, Let there be light, and there was light.\n";
-
-
 
     struct icu_buf_utf16 * rules16 = icu_buf_utf16_create(0);
     struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0);
@@ -377,8 +359,19 @@ void test_icu_I18N_transliterator(int argc, char **argv)
     icu_utf16_to_utf8(dest8, src16, &status);
     icu_check_status(status);
 
-    printf("Transliterator:\n%s\n", dest8->utf8);
 
+    if(!strcmp((const char *) dest8->utf8, 
+               (const char *) chk8cstr))
+        success = 1;
+    else {
+        success = 0;
+        printf("Normaliozation;");
+        printf("Rules:      '%s'\n", rules8cstr);
+        printf("Input:      '%s'\n", src8cstr);
+        printf("Normalized: '%s'\n", dest8->utf8);
+        printf("Expected:   '%s'\n", chk8cstr);
+    }
+    
 
     utrans_close (trans);
     icu_buf_utf16_destroy(rules16);
@@ -386,21 +379,10 @@ void test_icu_I18N_transliterator(int argc, char **argv)
     icu_buf_utf16_destroy(dest16);
     icu_buf_utf8_destroy(dest8);
 
+    return success;
+}
 
 #if 0
-   
-    icu_check_status(status);
-
-
-  int32_t ustr16_lim = *ustr16_len;
-    /* Transliterate a segment of a UChar* string */
-    
-    utrans_transUChars (trans, ustr16, &*ustr16_len,
-                        ustr16_cap,
-                        0, &ustr16_lim, &status);
-    
-    utrans_close (trans);
-
     printf("\n\nUnicode Set Patterns:\n"
              "   Pattern         Description\n"
              "   Ranges          [a-z]  The lower case letters a through z\n"
@@ -445,8 +427,45 @@ void test_icu_I18N_transliterator(int argc, char **argv)
              );
 #endif
 
+
+// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
+
+void test_icu_I18N_transliterator(int argc, char **argv)
+{
+    /* Each check passes (rules, input, expected) to test_icu_transliterator(), which returns 1 on a match; argc/argv are unused here. */
+    YAZ_CHECK(test_icu_transliterator("[:Punctuation:] Any-Remove",
+                                      "Don't shoot!",
+                                      "Dont shoot"));  /* apostrophe and '!' are expected to be dropped */
+    /* Control characters: the embedded "\n" is expected to vanish, punctuation stays. */
+    YAZ_CHECK(test_icu_transliterator("[:Control:] Any-Remove",
+                                      "Don't\n shoot!",
+                                      "Don't shoot!"));
+    /* Decimal digits: only the "4" goes, so both surrounding spaces remain in the expected string. */
+    YAZ_CHECK(test_icu_transliterator("[:Decimal_Number:] Any-Remove",
+                                      "This is 4 you!",
+                                      "This is  you!"));
+    /* Two chained rules: lowercase first, then drop everything that is not a letter (spaces included). */
+    YAZ_CHECK(test_icu_transliterator("Lower; [:^Letter:] Remove",
+                                      "Don't shoot!",
+                                      "dontshoot"));
+    /* Negated set: everything that is not a number is removed, leaving only "15". */
+    YAZ_CHECK(test_icu_transliterator("[:^Number:] Remove",
+                                      "Monday 15th of April",
+                                      "15"));
+    /* Union of two sets: whitespace and punctuation are removed, the digit "4" is kept. */
+    YAZ_CHECK(test_icu_transliterator("Lower;"
+                                      "[[:WhiteSpace:][:Punctuation:]] Remove",
+                                      " word4you? ",
+                                      "word4you"));
+    /* The next literals contain non-ASCII characters, hence the UTF-8 editor warning above this function. */
+    /* Accent stripping: NFD, remove nonspacing marks, NFC; the apostrophe is expected to stay. */
+    YAZ_CHECK(test_icu_transliterator("NFD; [:Nonspacing Mark:] Remove; NFC",
+                                      "à côté de l'alcôve ovoïde",
+                                      "a cote de l'alcove ovoide"));
+
 }
 
+
 // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8
 
 int test_icu_tokenizer(const char * locale, char action,