From 4b646f979589d12e6ce2b009d95321a5e7caa815 Mon Sep 17 00:00:00 2001 From: Marc Cromme Date: Mon, 14 May 2007 13:51:24 +0000 Subject: [PATCH] ICU chain of normalizers and tokenizers half-way implemented --- src/icu_I18N.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++- src/icu_I18N.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 140 insertions(+), 9 deletions(-) diff --git a/src/icu_I18N.c b/src/icu_I18N.c index 0508719..39c8716 100644 --- a/src/icu_I18N.c +++ b/src/icu_I18N.c @@ -1,4 +1,4 @@ -/* $Id: icu_I18N.c,v 1.11 2007-05-11 10:38:42 marc Exp $ +/* $Id: icu_I18N.c,v 1.12 2007-05-14 13:51:24 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -712,6 +712,77 @@ int icu_normalizer_normalize(struct icu_normalizer * normalizer, +struct icu_chain * icu_chain_create(const uint8_t * identifier, + const uint8_t * locale) +{ + + struct icu_chain * chain + = (struct icu_chain *) malloc(sizeof(struct icu_chain)); + + strncpy((char *) chain->identifier, (const char *) identifier, 128); + chain->identifier[128 - 1] = '\0'; + strncpy((char *) chain->locale, (const char *) locale, 16); + chain->locale[16 - 1] = '\0'; + + chain->token_count = 0; + + chain->display8 = icu_buf_utf8_create(0); + chain->norm8 = icu_buf_utf8_create(0); + chain->sort8 = icu_buf_utf8_create(0); + + chain->src16 = icu_buf_utf16_create(0); + + chain->steps = 0; + + return chain; +}; + +void icu_chain_destroy(struct icu_chain * chain) +{ + icu_buf_utf8_destroy(chain->display8); + icu_buf_utf8_destroy(chain->norm8); + icu_buf_utf8_destroy(chain->sort8); + + icu_buf_utf16_destroy(chain->src16); + + icu_chain_step_destroy(chain->steps); +}; + +struct icu_chain_step * icu_chain_append_step(struct icu_chain * chain, + enum icu_chain_step_type type, + const uint8_t * rule) +{ + + struct icu_chain_step * step + = (struct icu_chain_step *) malloc(sizeof(struct icu_chain_step)); + + + + return step; +}; + +void 
icu_chain_step_destroy(struct icu_chain_step * step){ + + if (!step) + return; + + if (step->next) + icu_chain_step_destroy(step->next); + + // destroy last living icu_chain-step + switch(step->type) { + case ICU_chain_step_type_normalize: + icu_normalizer_destroy(step->u.normalizer); + break; + case ICU_chain_step_type_tokenize: + icu_tokenizer_destroy(step->u.tokenizer); + break; + default: + break; + } +}; + + #endif // HAVE_ICU diff --git a/src/icu_I18N.h b/src/icu_I18N.h index 74adfc4..2746f07 100644 --- a/src/icu_I18N.h +++ b/src/icu_I18N.h @@ -1,4 +1,4 @@ -/* $Id: icu_I18N.h,v 1.11 2007-05-11 10:38:42 marc Exp $ +/* $Id: icu_I18N.h,v 1.12 2007-05-14 13:51:24 marc Exp $ Copyright (c) 2006-2007, Index Data. This file is part of Pazpar2. @@ -41,15 +41,8 @@ -// forward declarations -//struct UBreakIterator; - - - - // declared structs and functions - int icu_check_status (UErrorCode status); struct icu_buf_utf16 @@ -162,6 +155,73 @@ int icu_normalizer_normalize(struct icu_normalizer * normalizer, UErrorCode *status); +#if 0 +struct icu_token +{ + int32_t token_id; + uint8_t * display8; + uint8_t * norm8; + uint8_t * sort8; +} +#endif + +enum icu_chain_step_type { + ICU_chain_step_type_none, // + ICU_chain_step_type_display, // convert to utf8 display format + ICU_chain_step_type_norm, // convert to utf8 norm format + ICU_chain_step_type_sort, // convert to utf8 sort format + ICU_chain_step_type_charmap, // apply utf16 charmap + ICU_chain_step_type_normalize, // apply utf16 normalization + ICU_chain_step_type_tokenize // apply utf16 tokenization +}; + + + +struct icu_chain_step +{ + // type and action object + enum icu_chain_step_type type; + union { + struct icu_normalizer * normalizer; + struct icu_tokenizer * tokenizer; + } u; + // temporary post-action utf16 buffer + struct icu_buf_utf16 * buf16; + struct icu_chain_step * next; +}; + + +struct icu_chain +{ + uint8_t identifier[128]; + uint8_t locale[16]; + + // number of tokens returned so far + int32_t 
token_count; + + // utf8 output buffers + struct icu_buf_utf8 * display8; + struct icu_buf_utf8 * norm8; + struct icu_buf_utf8 * sort8; + + // utf16 source buffer + struct icu_buf_utf16 * src16; + + // linked list of chain steps + struct icu_chain_step * steps; +}; + +struct icu_chain * icu_chain_create(const uint8_t * identifier, + const uint8_t * locale); + +void icu_chain_destroy(struct icu_chain * chain); + +struct icu_chain_step * icu_chain_append_step(struct icu_chain * chain, + enum icu_chain_step_type type, + const uint8_t * rule); + +void icu_chain_step_destroy(struct icu_chain_step * step); + #endif // HAVE_ICU -- 1.7.10.4