-/* $Id: index_types.h,v 1.1 2007-10-25 09:22:36 adam Exp $
+/* $Id: index_types.h,v 1.2 2007-10-25 19:25:00 adam Exp $
Copyright (C) 1995-2007
Index Data ApS
*/
/**
- \files
+ \file
\brief Definitions for Zebra's index types
*/
YAZ_BEGIN_CDECL
-/**
- \brief zebra index rules handle (ptr)
-*/
+/** \brief zebra index types handle (ptr) */
typedef struct zebra_index_types_s *zebra_index_types_t;
-/** \brief creates index rules handler/object from file
+/** \brief zebra index type handle (ptr) */
+typedef struct zebra_index_type_s *zebra_index_type_t;
+
+/** \brief creates index types handler/object from file
\param fname filename
\returns handle (NULL if unsuccessful)
zebra_index_types_t zebra_index_types_create(const char *fname);
/** \brief destroys index rules object
- \param r handle
+ \param types handle
*/
-void zebra_index_types_destroy(zebra_index_types_t r);
+void zebra_index_types_destroy(zebra_index_types_t types);
/** \brief creates index types handler/object from xml Doc
/** \brief lookup of index type
- \param r rules
+ \param types types
\param id id to search for
\returns pattern ID
*/
-const char *zebra_index_type_lookup_str(zebra_index_types_t r, const char *id);
+const char *zebra_index_type_lookup_str(zebra_index_types_t types,
+ const char *id);
+
+
+/** \brief get index type of a given ID
+    \param types index types handle (dereferenced without a NULL check)
+    \param id ID to search for
+    \returns handle of the first index type whose id pattern matches the
+    given ID (glob matching via yaz_match_glob); 0 if none matches
+
+    The returned handle is not a copy: the index type objects are released
+    by zebra_index_types_destroy(), so it must not be used after that call.
+*/
+zebra_index_type_t zebra_index_type_get(zebra_index_types_t types,
+ const char *id);
+
+/** \brief check whether index type is of type 'index'
+    \param type index type (dereferenced without a NULL check)
+    \retval 1 YES
+    \retval 0 NO
+
+    YES is the default. The flag is taken from the "index" attribute of the
+    indextype element when present: YES only if its value starts with '1'.
+*/
+int zebra_index_type_is_index(zebra_index_type_t type);
+
+/** \brief check whether index type is of type 'sort'
+    \param type index type (dereferenced without a NULL check)
+    \retval 1 YES
+    \retval 0 NO
+
+    NO is the default. The flag is taken from the "sort" attribute of the
+    indextype element when present: YES only if its value starts with '1'.
+*/
+int zebra_index_type_is_sort(zebra_index_type_t type);
+
+/** \brief check whether index type is of type 'staticrank'
+    \param type index type (dereferenced without a NULL check)
+    \retval 1 YES
+    \retval 0 NO
+
+    NO is the default. The flag is taken from the "staticrank" attribute of
+    the indextype element when present: YES only if its value starts
+    with '1'.
+*/
+int zebra_index_type_is_staticrank(zebra_index_type_t type);
+
+
+/** \brief tokenize a term for an index type
+    \param type index type (dereferenced without a NULL check)
+    \param buf term buffer (pass 0 to continue with previous buf)
+    \param len term length
+    \param result_buf resulting token buffer
+    \param result_len resulting token length
+    \retval 1 token read and result is in result_buf
+    \retval 0 no token read (no more tokens in buf)
+
+    A non-zero buf is copied into a buffer owned by the index type, so the
+    caller's buffer is never modified. On success result_buf points into
+    that internal copy; the token is NOT 0-terminated, use result_len.
+    NOTE(review): the pointer presumably stays valid only until the next
+    call that passes a new buf, or until the types object is destroyed.
+
+    Only index types configured with a "simple" rule produce tokens: the
+    term is split on punctuation/whitespace and printable ASCII is
+    lower-cased. For any other index type this function returns 0.
+*/
+int zebra_index_type_tokenize(zebra_index_type_t type,
+ const char *buf, size_t len,
+ const char **result_buf, size_t *result_len);
YAZ_END_CDECL
-/* $Id: index_types.c,v 1.1 2007-10-25 09:22:36 adam Exp $
+/* $Id: index_types.c,v 1.2 2007-10-25 19:25:00 adam Exp $
Copyright (C) 1995-2007
Index Data ApS
02111-1307, USA.
*/
+/**
+ \file
+ \brief Implementation of Zebra's index types system
+*/
+
#include <assert.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include "index_types.h"
+#include <yaz/icu_I18N.h>
#include <yaz/match_glob.h>
#include <yaz/xmalloc.h>
#include <yaz/wrbuf.h>
+/* Container for all index types read from one XML document.
+   rules is the head of a singly linked list (chained through next) that
+   zebra_index_types_destroy() walks and frees; doc is released there too
+   with xmlFreeDoc(). The index types keep pointers into doc (node and
+   attribute text), so doc has to live as long as the list does. */
struct zebra_index_types_s {
#if YAZ_HAVE_XML2
- struct zebra_index_type *rules;
+ zebra_index_type_t rules;
xmlDocPtr doc;
#endif
};
#if YAZ_HAVE_XML2
-struct zebra_index_type {
+/* One index type, built by parse_index_type() from a single XML element
+   and released by index_type_destroy(). */
+struct zebra_index_type_s {
+ /* XML element this index type was parsed from */
const xmlNode *ptr;
+ /* pattern that zebra_index_type_get() glob-matches lookup IDs against */
const char *id;
+ /* handed to icu_chain_xml_config() when an ICU chain is configured */
const char *locale;
+ /* attribute values are stored as pointers to the XML attribute text,
+ not as copies (visible for alwaysmatches and firstinfield;
+ NOTE(review): presumably the same for position - confirm) */
const char *position;
const char *alwaysmatches;
const char *firstinfield;
- const char *sort;
- struct zebra_index_type *next;
+ /* boolean flags from the "sort", "index" and "staticrank" attributes;
+ a flag is 1 when the attribute value starts with '1'. Defaults set by
+ parse_index_type(): sort 0, index 1, staticrank 0 */
+ int sort_flag;
+ int index_flag;
+ int staticrank_flag;
+ /* 1 when the rule element is "simple" (built-in tokenizer) */
+ int simple_chain;
+#if HAVE_ICU
+ /* ICU chain built from an "icu_chain" rule element; 0 if none */
+ struct icu_chain *chain;
+#endif
+ /* next index type in the list owned by zebra_index_types_s */
+ zebra_index_type_t next;
+ /* private copy of the term currently being tokenized (simple chain) */
+ WRBUF simple_buf;
+ /* offset into simple_buf where the next tokenize call resumes */
+ size_t simple_off;
};
-struct zebra_index_type *parse_index_type(const xmlNode *ptr)
+static void index_type_destroy(zebra_index_type_t t);
+
+/* Builds one index type from an indextype XML element.
+
+ ptr is the indextype element. Its attributes fill in the string fields
+ and the index/sort/staticrank flags (a flag is 1 when the attribute
+ value starts with '1'). String fields point into the XML document and
+ are not copied. The first element child selects the tokenizer:
+ "icu_chain" builds an ICU chain, "simple" enables the built-in one.
+
+ Returns the new index type, or 0 when an attribute is unsupported, the
+ rule element is missing or unknown, or the ICU chain cannot be built.
+
+ The object owns a WRBUF from the moment it is allocated, so every
+ failure path has to release it through index_type_destroy(); a plain
+ xfree() would leak that buffer. */
+zebra_index_type_t parse_index_type(const xmlNode *ptr)
{
struct _xmlAttr *attr;
- struct zebra_index_type *rule;
+ struct zebra_index_type_s *rule;
rule = xmalloc(sizeof(*rule));
rule->next = 0;
+#if HAVE_ICU
+ rule->chain = 0;
+#endif
rule->ptr = ptr;
rule->locale = 0;
rule->id = 0;
rule->position = 0;
rule->alwaysmatches = 0;
rule->firstinfield = 0;
- rule->sort = 0;
+ rule->sort_flag = 0;
+ rule->index_flag = 1;
+ rule->staticrank_flag = 0;
+ rule->simple_chain = 0;
+ rule->simple_buf = wrbuf_alloc();
for (attr = ptr->properties; attr; attr = attr->next)
{
if (attr->children && attr->children->type == XML_TEXT_NODE)
rule->alwaysmatches = (const char *) attr->children->content;
else if (!strcmp((const char *) attr->name, "firstinfield"))
rule->firstinfield = (const char *) attr->children->content;
+ else if (!strcmp((const char *) attr->name, "index"))
+ {
+ const char *v = (const char *) attr->children->content;
+ if (v)
+ rule->index_flag = *v == '1';
+ }
else if (!strcmp((const char *) attr->name, "sort"))
- rule->sort = (const char *) attr->children->content;
+ {
+ const char *v = (const char *) attr->children->content;
+ if (v)
+ rule->sort_flag = *v == '1';
+ }
+ else if (!strcmp((const char *) attr->name, "staticrank"))
+ {
+ const char *v = (const char *) attr->children->content;
+ if (v)
+ rule->staticrank_flag = *v == '1';
+ }
else
{
- yaz_log(YLOG_WARN, "Unsupport attribute '%s' for indexrule",
+ yaz_log(YLOG_WARN, "Unsupport attribute '%s' for indextype",
attr->name);
- xfree(rule);
+ index_type_destroy(rule);
return 0;
}
}
}
+ /* the rule is the first element child; skip text and comment nodes */
+ ptr = ptr->children;
+ while (ptr && ptr->type != XML_ELEMENT_NODE)
+ ptr = ptr->next;
+ if (!ptr)
+ {
+ yaz_log(YLOG_WARN, "Missing rules for indexrule");
+ index_type_destroy(rule);
+ rule = 0;
+ }
+ else if (!strcmp((const char *) ptr->name, "icu_chain"))
+ {
+#if HAVE_ICU
+ /* ICU convention: the error code is initialized by the caller */
+ UErrorCode status = U_ZERO_ERROR;
+ rule->chain = icu_chain_xml_config(ptr,
+ rule->locale,
+ rule->sort_flag,
+ &status);
+ if (!rule->chain)
+ {
+ index_type_destroy(rule);
+ rule = 0;
+ }
+#else
+ yaz_log(YLOG_WARN, "ICU unsupported (must be part of YAZ)");
+ /* not xfree(): the rule already owns simple_buf */
+ index_type_destroy(rule);
+ rule = 0;
+#endif
+ }
+ else if (!strcmp((const char *) ptr->name, "simple"))
+ {
+ rule->simple_chain = 1;
+ }
+ else
+ {
+ yaz_log(YLOG_WARN, "Unsupported mapping %s for indexrule", ptr->name);
+ index_type_destroy(rule);
+ rule = 0;
+ }
return rule;
}
/* YAZ_HAVE_XML2 */
{
#if YAZ_HAVE_XML2
zebra_index_types_t r = xmalloc(sizeof(*r));
- struct zebra_index_type **rp = &r->rules;
+ zebra_index_type_t *rp = &r->rules;
const xmlNode *top = xmlDocGetRootElement(doc);
r->doc = doc;
}
return r;
#else
- yaz_log(YLOG_WARN, "Cannot read index types %s because YAZ is without XML "
- "support", fname);
+ yaz_log(YLOG_WARN, "XML unsupported. Cannot read index rules");
return 0;
/* YAZ_HAVE_XML2 */
#endif
}
-void zebra_index_types_destroy(zebra_index_types_t r)
+/* Releases one index type: the ICU chain (when built with ICU and a chain
+   was configured), the tokenizer buffer and the object itself.
+   Passing 0 is allowed and does nothing. The next pointer is not
+   followed, so the caller unlinks the object from its list first. */
+static void index_type_destroy(zebra_index_type_t t)
{
-#if YAZ_HAVE_XML2
- struct zebra_index_type *rule;
- while (r->rules)
+ if (t)
{
- rule = r->rules;
- r->rules = rule->next;
- xfree(rule);
+#if HAVE_ICU
+ if (t->chain)
+ icu_chain_destroy(t->chain);
+#endif
+ wrbuf_destroy(t->simple_buf);
+ xfree(t);
}
- xmlFreeDoc(r->doc);
+}
+/* Destroys an index types object: every index type in the list, the XML
+   document and finally the container. Passing 0 is allowed and does
+   nothing. */
+void zebra_index_types_destroy(zebra_index_types_t r)
+{
+ if (r)
+ {
+#if YAZ_HAVE_XML2
+ zebra_index_type_t rule;
+ /* pop from the head so rule->next is read before the rule is freed */
+ while (r->rules)
+ {
+ rule = r->rules;
+ r->rules = rule->next;
+ index_type_destroy(rule);
+ }
+ /* freed after the rules, which point into the document */
+ xmlFreeDoc(r->doc);
+
#endif
- xfree(r);
+ xfree(r);
+ }
}
-const char *zebra_index_type_lookup_str(zebra_index_types_t r, const char *id)
+/* Returns the first index type in list order whose id pattern matches id
+   (yaz_match_glob), or 0 when none matches. Without XML support it
+   always returns 0. types is dereferenced without a NULL check.
+   NOTE(review): rule->id starts out as 0 in parse_index_type(); confirm
+   that yaz_match_glob() is never reached with a 0 pattern. */
+zebra_index_type_t zebra_index_type_get(zebra_index_types_t types,
+ const char *id)
{
#if YAZ_HAVE_XML2
-
- struct zebra_index_type *rule = r->rules;
+ zebra_index_type_t rule = types->rules;
while (rule && !yaz_match_glob(rule->id, id))
rule = rule->next;
- if (rule)
- return rule->id;
+ return rule;
#endif
return 0;
}
+/* Looks up the index type matching id and returns its id pattern,
+   or 0 when no index type matches. */
+const char *zebra_index_type_lookup_str(zebra_index_types_t types,
+                                        const char *id)
+{
+    zebra_index_type_t match = zebra_index_type_get(types, id);
+
+    return match ? match->id : 0;
+}
+
+/* Reports the 'index' flag of an index type (1 or 0).
+   type is dereferenced without a NULL check. */
+int zebra_index_type_is_index(zebra_index_type_t type)
+{
+    const int enabled = type->index_flag;
+
+    return enabled;
+}
+
+/* Reports the 'sort' flag of an index type (1 or 0).
+   type is dereferenced without a NULL check. */
+int zebra_index_type_is_sort(zebra_index_type_t type)
+{
+    const int enabled = type->sort_flag;
+
+    return enabled;
+}
+
+/* Reports the 'staticrank' flag of an index type (1 or 0).
+   type is dereferenced without a NULL check. */
+int zebra_index_type_is_staticrank(zebra_index_type_t type)
+{
+    const int enabled = type->staticrank_flag;
+
+    return enabled;
+}
+
+#define SE_CHARS ";,.()-/?<> \r\n\t"
+
+/* Built-in tokenizer: extracts the next token from the term stored in
+   type->simple_buf, resuming at type->simple_off.
+
+   Bytes listed in SE_CHARS separate tokens. Token bytes in the printable
+   ASCII range (33..126) are lower-cased in place inside simple_buf.
+   On success *result_buf points into simple_buf (no 0-terminator) and
+   *result_len holds the token length; the outputs are left untouched
+   when no token is found. simple_off is advanced in both cases.
+
+   Returns 1 if a token was found, 0 if the buffer is exhausted. */
+int tokenize_simple(zebra_index_type_t type,
+                    const char **result_buf, size_t *result_len)
+{
+    char *text = wrbuf_buf(type->simple_buf);
+    size_t end = wrbuf_len(type->simple_buf);
+    size_t pos = type->simple_off;
+    size_t first;
+
+    /* skip the separators in front of the token */
+    for (; pos < end; pos++)
+        if (!strchr(SE_CHARS, text[pos]))
+            break;
+    first = pos;
+
+    /* consume the token up to the next separator or end of buffer */
+    for (; pos < end; pos++)
+    {
+        if (strchr(SE_CHARS, text[pos]))
+            break;
+        if (text[pos] > 32 && text[pos] < 127)
+            text[pos] = tolower(text[pos]);
+    }
+
+    type->simple_off = pos;
+    if (first == pos)
+        return 0;
+
+    *result_buf = text + first;
+    *result_len = pos - first;
+    return 1;
+}
+
+/* Public tokenizer entry point.
+
+   Index types without a "simple" rule yield no tokens here (returns 0).
+   For a simple rule, a non-zero buf starts a new term: its len bytes are
+   copied into the index type's own buffer and scanning restarts at
+   offset 0. A zero buf continues with the term passed earlier.
+
+   Returns 1 with *result_buf / *result_len set when a token was read,
+   0 otherwise. */
+int zebra_index_type_tokenize(zebra_index_type_t type,
+                              const char *buf, size_t len,
+                              const char **result_buf, size_t *result_len)
+{
+    if (!type->simple_chain)
+        return 0;
+
+    if (buf)
+    {
+        /* new term: replace the stored copy and restart the scan */
+        wrbuf_rewind(type->simple_buf);
+        wrbuf_write(type->simple_buf, buf, len);
+        type->simple_off = 0;
+    }
+    return tokenize_simple(type, result_buf, result_len);
+}
+
/*
* Local variables:
* c-basic-offset: 4
-/* $Id: tst_index_types.c,v 1.2 2007-10-25 09:23:34 adam Exp $
+/* $Id: tst_index_types.c,v 1.3 2007-10-25 19:25:00 adam Exp $
Copyright (C) 1995-2007
Index Data ApS
" <indextypes>"
" <indextype id=\"*:w:el\" position=\"1\" alwaysmatches=\"1\" firstinfield=\"1\"\n"
" locale=\"el\">\n"
-" <!-- conversion rules for words -->\n"
+" <simple/>\n"
" </indextype>\n"
" <indextype id=\"*:w\" position=\"1\" alwaysmatches=\"1\" firstinfield=\"1\"\n"
" locale=\"en\">\n"
-" <!-- conversion rules for words -->\n"
+" <simple/>\n"
" </indextype>\n"
" <indextype id=\"*:p\" position=\"0\" alwaysmatches=\"0\" firstinfield=\"0\"\n"
" locale=\"en\">\n"
-" <!-- conversion rules for phrase -->\n"
+" <simple/>\n"
" </indextype>\n"
" <indextype id=\"*:s\" sort=\"1\" \n"
" locale=\"en\">\n"
-" <!-- conversion rules for phrase -->\n"
+" <simple/>\n"
" </indextype>\n"
" </indextypes>\n"
;
void tst1(void)
{
+#if YAZ_HAVE_XML2
xmlDocPtr doc = xmlParseMemory(xml_str, strlen(xml_str));
YAZ_CHECK(doc);
if (doc)
{
zebra_index_types_t rules = zebra_index_types_create_doc(doc);
+ zebra_index_type_t type;
YAZ_CHECK(rules);
if (!rules)
}
}
+ type = zebra_index_type_get(rules, "any:w");
+ YAZ_CHECK(type);
+ if (type)
+ {
+ const char *buf = " How are you?";
+ size_t len = strlen(buf);
+ int r = 1;
+
+ if (r)
+ {
+ const char *result_buf = 0;
+ size_t result_len = 0;
+ r = zebra_index_type_tokenize(type, buf, len,
+ &result_buf, &result_len);
+ YAZ_CHECK_EQ(r, 1);
+ YAZ_CHECK(result_len == 3 &&
+ !memcmp(result_buf, "how", result_len));
+ }
+
+ if (r)
+ {
+ const char *result_buf = 0;
+ size_t result_len = 0;
+ r = zebra_index_type_tokenize(type, 0, 0,
+ &result_buf, &result_len);
+ YAZ_CHECK_EQ(r, 1);
+ YAZ_CHECK(result_len == 3 &&
+ !memcmp(result_buf, "are", result_len));
+ }
+
+ if (r)
+ {
+ const char *result_buf = 0;
+ size_t result_len = 0;
+ r = zebra_index_type_tokenize(type, 0, 0,
+ &result_buf, &result_len);
+ YAZ_CHECK_EQ(r, 1);
+ YAZ_CHECK(result_len == 3 &&
+ !memcmp(result_buf, "you", result_len));
+ }
+
+ if (r)
+ {
+ const char *result_buf = 0;
+ size_t result_len = 0;
+ r = zebra_index_type_tokenize(type, 0, 0,
+ &result_buf, &result_len);
+ YAZ_CHECK_EQ(r, 0);
+ }
+ }
zebra_index_types_destroy(rules);
}
+#else
+ zebra_index_types_t rules = zebra_index_types_create_doc(doc);
+ YAZ_CHECK(!rules);
+#endif
}
int main(int argc, char **argv)