From: Marc Cromme Date: Wed, 9 May 2007 14:01:21 +0000 (+0000) Subject: ICU tokenizer works now X-Git-Tag: PAZPAR2.1.0.0~166 X-Git-Url: http://sru.miketaylor.org.uk/cgi-bin?a=commitdiff_plain;h=0dc3f6fe0bca19b271b3f34263e216ad9aba8d0a;p=pazpar2-moved-to-github.git ICU tokenizer works now --- diff --git a/src/icu_I18N.c b/src/icu_I18N.c index b7ba91d..fa9bd82 100644 --- a/src/icu_I18N.c +++ b/src/icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: icu_I18N.c,v 1.7 2007-05-07 12:52:04 marc Exp $ +/* $Id: icu_I18N.c,v 1.8 2007-05-09 14:01:21 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -55,11 +55,13 @@ int icu_check_status (UErrorCode status) { - //if(U_FAILURE(status)) - if(!U_SUCCESS(status)) + if(U_FAILURE(status)){ yaz_log(YLOG_WARN, "ICU: %d %s\n", status, u_errorName(status)); - return status; + return 0; + } + return 1; + } @@ -151,7 +153,8 @@ struct icu_buf_utf8 * icu_buf_utf8_resize(struct icu_buf_utf8 * buf8, buf8->utf8 = (uint8_t *) malloc(sizeof(uint8_t) * capacity); else buf8->utf8 - = (uint8_t *) realloc(buf8->utf8, sizeof(uint8_t) * capacity); + = (uint8_t *) realloc(buf8->utf8, + sizeof(uint8_t) * capacity); buf8->utf8[0] = (uint8_t) 0; buf8->utf8_len = 0; buf8->utf8_cap = capacity; @@ -405,6 +408,175 @@ UErrorCode icu_sortkey8_from_utf16(UCollator *coll, +struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action, + UErrorCode *status) +{ + struct icu_tokenizer * tokenizer + = (struct icu_tokenizer *) malloc(sizeof(struct icu_tokenizer)); + + strcpy(tokenizer->locale, locale); + tokenizer->action = action; + tokenizer->bi = 0; + tokenizer->buf16 = 0; + tokenizer->token_id = 0; + tokenizer->token_start = 0; + tokenizer->token_end = 0; + + + switch(tokenizer->action) { + case 'l': + tokenizer->bi + = ubrk_open(UBRK_LINE, tokenizer->locale, + 0, 0, status); + break; + case 's': + tokenizer->bi + = ubrk_open(UBRK_SENTENCE, tokenizer->locale, + 0, 0, status); + break; + case 'w': + tokenizer->bi + = ubrk_open(UBRK_WORD, tokenizer->locale, + 0, 0, status); + break; + case 'c': + tokenizer->bi + = ubrk_open(UBRK_CHARACTER, tokenizer->locale, + 0, 0, status); + break; + case 't': + tokenizer->bi + = ubrk_open(UBRK_TITLE, tokenizer->locale, + 0, 0, status); + break; + default: + *status = U_UNSUPPORTED_ERROR; + return 0; + break; + } + + // ICU error stuff is a very funny business + if (U_SUCCESS(*status)) + return tokenizer; + + // reestablishing zero error state + //if (*status == U_USING_DEFAULT_WARNING) + // *status = U_ZERO_ERROR; + + + // freeing if failed + free(tokenizer); + return 0; +}; + +void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer) +{ + + if (tokenizer) { + if (tokenizer->bi) + ubrk_close(tokenizer->bi); + free(tokenizer); + } +}; + +int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, + struct icu_buf_utf16 * src16, + UErrorCode *status) +{ + if (!tokenizer || !tokenizer->bi || !src16) + return 0; + + tokenizer->buf16 = src16; + + ubrk_setText(tokenizer->bi, src16->utf16, src16->utf16_len, status); + + + if (U_FAILURE(*status)) + return 0; + + return 1; +}; + +int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, + struct icu_buf_utf16 * tkn16, + UErrorCode *status) +{ + int32_t tkn_start = 0; + int32_t tkn_end = 0; + + + if (!tokenizer || !tokenizer->bi + || !tokenizer->buf16 || !tokenizer->buf16->utf16_len) + return 0; + + // never change tokenizer->buf16 and keep always invariant + // 0 <= tokenizer->token_start + // <= tokenizer->token_end + // <= tokenizer->buf16->utf16_len + // returns length of token + + if (0 == tokenizer->token_end) // first call + tkn_start = ubrk_first(tokenizer->bi); + else //successive calls + tkn_start = tokenizer->token_end; + + // get next position + tkn_end = ubrk_next(tokenizer->bi); + + // repairing invariant at end of ubrk, which is UBRK_DONE = -1 + if (UBRK_DONE == tkn_end) + tkn_end = tokenizer->buf16->utf16_len; + + // copy out if everything is well + if(U_FAILURE(*status)) + return 0; + + tokenizer->token_id++; + tokenizer->token_start = tkn_start; + tokenizer->token_end = tkn_end; + + // copying into token buffer if it exists + if (tkn16){ + if (tkn16->utf16_cap < (tkn_end - tkn_start)) + icu_buf_utf16_resize(tkn16, (size_t) (tkn_end - tkn_start) * 2); + + u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start], + (tkn_end - tkn_start)); + + tkn16->utf16_len = (tkn_end - tkn_start); + } + + return (tokenizer->token_end - tokenizer->token_start); +} + + +int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer) +{ + return tokenizer->token_id; +}; + +int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer) +{ + return tokenizer->token_start; +}; + +int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer) +{ + return tokenizer->token_end; +}; + +int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer) +{ + return (tokenizer->token_end - tokenizer->token_start); +}; + +int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer) +{ + return tokenizer->token_count; +}; + + + #endif // HAVE_ICU diff --git a/src/icu_I18N.h b/src/icu_I18N.h index 803d89b..df6cd2d 100644 --- a/src/icu_I18N.h +++ b/src/icu_I18N.h @@ -1,4 +1,4 @@ -/* $Id: icu_I18N.h,v 1.7 2007-05-07 12:52:04 marc Exp $ +/* $Id: icu_I18N.h,v 1.8 2007-05-09 14:01:21 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -35,10 +35,19 @@ //#include /* C Converter API */ //#include /* some more string fcns*/ //#include -//#include +#include //#include +// forward declarations +//struct UBreakIterator; + + + + +// declared structs and functions + + int icu_check_status (UErrorCode status); struct icu_buf_utf16 @@ -91,6 +100,44 @@ UErrorCode icu_sortkey8_from_utf16(UCollator *coll, struct icu_buf_utf16 * src16, UErrorCode * status); +struct icu_tokenizer +{ + char locale[16]; + char action; + UBreakIterator* bi; + struct icu_buf_utf16 * buf16; + int32_t token_count; + int32_t token_id; + int32_t token_start; + int32_t token_end; + // keep always invariant + // 0 <= token_start + // <= token_end + // <= buf16->utf16_len + // and invariant + // 0 <= token_id <= token_count +}; + +struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action, + UErrorCode *status); + +void icu_tokenizer_destroy(struct icu_tokenizer * tokenizer); + +int icu_tokenizer_attach(struct icu_tokenizer * tokenizer, + struct icu_buf_utf16 * src16, UErrorCode *status); + +int32_t icu_tokenizer_next_token(struct icu_tokenizer * tokenizer, + struct icu_buf_utf16 * tkn16, + UErrorCode *status); + +int32_t icu_tokenizer_token_id(struct icu_tokenizer * tokenizer); +int32_t icu_tokenizer_token_start(struct icu_tokenizer * tokenizer); +int32_t icu_tokenizer_token_end(struct icu_tokenizer * tokenizer); +int32_t icu_tokenizer_token_length(struct icu_tokenizer * tokenizer); +int32_t icu_tokenizer_token_count(struct icu_tokenizer * tokenizer); + + + #endif // HAVE_ICU #endif // ICU_I18NL_H diff --git a/src/test_icu_I18N.c b/src/test_icu_I18N.c index c9d3e39..992922a 100644 --- a/src/test_icu_I18N.c +++ b/src/test_icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: test_icu_I18N.c,v 1.10 2007-05-07 12:52:04 marc Exp $ +/* $Id: test_icu_I18N.c,v 1.11 2007-05-09 14:01:21 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -209,7 +209,7 @@ int test_icu_sortmap(const char * locale, int src_list_len, UCollator *coll = ucol_open(locale, &status); icu_check_status(status); - if(!U_SUCCESS(status)) + if(U_FAILURE(status)) return 0; // assigning display terms and sort keys using buf 8 and buf16 @@ -312,6 +312,75 @@ void test_icu_I18N_sortmap(int argc, char **argv) } +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + +void test_icu_I18N_normmap(int argc, char **argv) +{ + + +} + + +// DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 + +void test_icu_I18N_tokenizer(int argc, char **argv) +{ + + const char * src8cstr + = "Though I am not naturally honest, I am so sometimes by chance."; + + UErrorCode status = U_ZERO_ERROR; + struct icu_buf_utf16 * src16 = icu_buf_utf16_create(0); + struct icu_buf_utf16 * tkn16 = icu_buf_utf16_create(0); + struct icu_buf_utf8 * tkn8 = icu_buf_utf8_create(0); + + printf("Input: '%s'\n", src8cstr); + + // transforming to UTF16 + icu_utf16_from_utf8_cstr(src16, src8cstr, &status); + icu_check_status(status); + + // set up tokenizer + struct icu_tokenizer * tokenizer + = icu_tokenizer_create("en", 's', &status); + icu_check_status(status); + YAZ_CHECK(tokenizer); + + // attach text buffer to tokenizer + icu_tokenizer_attach(tokenizer, src16, &status); + icu_check_status(status); + YAZ_CHECK(tokenizer->bi); + + // perform work on tokens + printf("Tokens: "); + while(icu_tokenizer_next_token(tokenizer, tkn16, &status)){ + icu_check_status(status); + + // converting to UTF8 + icu_utf16_to_utf8(tkn8, tkn16, &status); + + printf("'%s' ", tkn8->utf8); + + //printf("token %d %d %d %d '%s'\n", + // icu_tokenizer_token_id(tokenizer), + // icu_tokenizer_token_start(tokenizer), + // icu_tokenizer_token_end(tokenizer), + // icu_tokenizer_token_length(tokenizer), + // tkn8->utf8); + } + printf(" (%d)(%d)\n", icu_tokenizer_token_id(tokenizer), + icu_tokenizer_token_count(tokenizer)); + + icu_tokenizer_destroy(tokenizer); + icu_buf_utf16_destroy(src16); + icu_buf_utf16_destroy(tkn16); + icu_buf_utf8_destroy(tkn8); +} + + + + + #endif // HAVE_ICU // DO NOT EDIT THIS FILE IF YOUR EDITOR DOES NOT SUPPORT UTF-8 @@ -327,7 +396,9 @@ int main(int argc, char **argv) //test_icu_I18N_casemap_failures(argc, argv); test_icu_I18N_casemap(argc, argv); test_icu_I18N_sortmap(argc, argv); - + test_icu_I18N_normmap(argc, argv); + test_icu_I18N_tokenizer(argc, argv); + #else // HAVE_ICU printf("ICU unit tests omitted.\n"