From: Marc Cromme Date: Fri, 11 May 2007 22:23:33 +0000 (+0000) Subject: checked in very nice ICU normalization examples X-Git-Tag: PAZPAR2.1.0.0~153 X-Git-Url: http://sru.miketaylor.org.uk/cgi-bin?a=commitdiff_plain;h=a769527a0ad99891a945f8d34ecc89ac287f8505;p=pazpar2-moved-to-github.git checked in very nice ICU normalization examples --- diff --git a/src/test_icu_I18N.c b/src/test_icu_I18N.c index 5b707ac..13ed02e 100644 --- a/src/test_icu_I18N.c +++ b/src/test_icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: test_icu_I18N.c,v 1.16 2007-05-11 08:27:29 marc Exp $ +/* $Id: test_icu_I18N.c,v 1.17 2007-05-11 22:23:33 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -314,33 +314,15 @@ void test_icu_I18N_sortmap(int argc, char **argv) // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 -void test_icu_I18N_transliterator(int argc, char **argv) +int test_icu_transliterator(const char * rules8cstr, + const char * src8cstr, + const char * chk8cstr) { + int success = 0; UErrorCode status = U_ZERO_ERROR; UParseError parse_error[256]; - const char * rules8cstr = "[:Punctuation:] Any-Remove"; - //const char * rules8cstr = "[:Control:] Any-Remove"; - //const char * rules8cstr = "[:Decimal_Number:] Any-Remove\n"; - //const char * rules8cstr = "[:Final_Punctuation:] Any-Remove"; - //const char * rules8cstr = "Lower; [:^Letter:] Remove"; - //const char * rules8cstr = "[:^Number:] Remove"; - //const char * rules8cstr = "Lower;[[:WhiteSpace:][:Punctuation:]] Remove"; - //const char * rules8cstr = "NFD; [:Nonspacing Mark:] Remove; NFC"; - - - - - const char * src8cstr = "Genesis 1\n" - "The Beginning\n" - "1 In the beginning God created the heavens and the earth.\n" - "2 Now the earth was formless and empty, darkness was over " - "the surface of the deep, and the Spirit of God was hovering " - "over the waters.\n" - "3 And God said, Let there be light, and there was light.\n"; - - struct icu_buf_utf16 * rules16 = icu_buf_utf16_create(0); struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0); @@ -377,8 +359,19 @@ void test_icu_I18N_transliterator(int argc, char **argv) icu_utf16_to_utf8(dest8, src16, &status); icu_check_status(status); - printf("Transliterator:\n%s\n", dest8->utf8); + if(!strcmp((const char *) dest8->utf8, + (const char *) chk8cstr)) + success = 1; + else { + success = 0; + printf("Normaliozation;"); + printf("Rules: '%s'\n", rules8cstr); + printf("Input: '%s'\n", src8cstr); + printf("Normalized: '%s'\n", dest8->utf8); + printf("Expected: '%s'\n", chk8cstr); + } + utrans_close (trans); icu_buf_utf16_destroy(rules16); @@ -386,21 +379,10 @@ void test_icu_I18N_transliterator(int argc, char **argv) icu_buf_utf16_destroy(dest16); icu_buf_utf8_destroy(dest8); + return success; +} #if 0 - - icu_check_status(status); - - - int32_t ustr16_lim = *ustr16_len; - /* Transliterate a segment of a UChar* string */ - - utrans_transUChars (trans, ustr16, &*ustr16_len, - ustr16_cap, - 0, &ustr16_lim, &status); - - utrans_close (trans); - printf("\n\nUnicode Set Patterns:\n" " Pattern Description\n" " Ranges [a-z] The lower case letters a through z\n" @@ -445,8 +427,45 @@ void test_icu_I18N_transliterator(int argc, char **argv) ); #endif + +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + +void test_icu_I18N_transliterator(int argc, char **argv) +{ + + YAZ_CHECK(test_icu_transliterator("[:Punctuation:] Any-Remove", + "Don't shoot!", + "Dont shoot")); + + YAZ_CHECK(test_icu_transliterator("[:Control:] Any-Remove", + "Don't\n shoot!", + "Don't shoot!")); + + YAZ_CHECK(test_icu_transliterator("[:Decimal_Number:] Any-Remove", + "This is 4 you!", + "This is you!")); + + YAZ_CHECK(test_icu_transliterator("Lower; [:^Letter:] Remove", + "Don't shoot!", + "dontshoot")); + + YAZ_CHECK(test_icu_transliterator("[:^Number:] Remove", + "Monday 15th of April", + "15")); + + YAZ_CHECK(test_icu_transliterator("Lower;" + "[[:WhiteSpace:][:Punctuation:]] Remove", + " word4you? ", + "word4you")); + + + YAZ_CHECK(test_icu_transliterator("NFD; [:Nonspacing Mark:] Remove; NFC", + "à côté de l'alcôve ovoïde", + "a cote de l'alcove ovoide")); + } + // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 int test_icu_tokenizer(const char * locale, char action,