Tokenize for index_type system.

author Adam Dickmeiss <adam@indexdata.dk>

Thu, 25 Oct 2007 19:25:00 +0000 (19:25 +0000)

committer Adam Dickmeiss <adam@indexdata.dk>

Thu, 25 Oct 2007 19:25:00 +0000 (19:25 +0000)
author Adam Dickmeiss <adam@indexdata.dk>
Thu, 25 Oct 2007 19:25:00 +0000 (19:25 +0000)
committer Adam Dickmeiss <adam@indexdata.dk>
Thu, 25 Oct 2007 19:25:00 +0000 (19:25 +0000)
diff --git a/include/index_types.h b/include/index_types.h

index 389b316..b930c61 100644 (file)
--- a/include/index_types.h
+++ b/include/index_types.h
@@ -1,4 +1,4 @@
-/* $Id: index_types.h,v 1.1 2007-10-25 09:22:36 adam Exp $
+/* $Id: index_types.h,v 1.2 2007-10-25 19:25:00 adam Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
@@ -21,7 +21,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
  
  /** 
-    \files
+    \file
      \brief Definitions for Zebra's index types
  */
  
@@ -33,12 +33,13 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  
  YAZ_BEGIN_CDECL
  
-/**
-   \brief zebra index rules handle (ptr)
-*/
+/** \brief zebra index types handle (ptr) */
  typedef struct zebra_index_types_s *zebra_index_types_t;
  
-/** \brief creates index rules handler/object from file
+/** \brief zebra index type handle (ptr) */
+typedef struct zebra_index_type_s *zebra_index_type_t;
+
+/** \brief creates index types handler/object from file
      \param fname filename
      \returns handle (NULL if unsuccessful)
  
@@ -63,9 +64,9 @@ typedef struct zebra_index_types_s *zebra_index_types_t;
  zebra_index_types_t zebra_index_types_create(const char *fname);
  
  /** \brief destroys index rules object
-    \param r handle
+    \param types handle
   */
-void zebra_index_types_destroy(zebra_index_types_t r);
+void zebra_index_types_destroy(zebra_index_types_t types);
  
  
  /** \brief creates index types handler/object from xml Doc
@@ -78,11 +79,56 @@ zebra_index_types_t zebra_index_types_create_doc(xmlDocPtr doc);
  
  
  /** \brief lookup of index type
-    \param r rules
+    \param types types
      \param id id to search for
      \returns pattern ID
  */
-const char *zebra_index_type_lookup_str(zebra_index_types_t r, const char *id);
+const char *zebra_index_type_lookup_str(zebra_index_types_t types, 
+                                        const char *id);
+
+
+/** \brief get index type of a given ID
+    \param types types
+    \param id ID to search for
+    \returns index type handle
+*/
+zebra_index_type_t zebra_index_type_get(zebra_index_types_t types, 
+                                        const char *id);
+
+/** \brief check whether index type is of type 'index'
+    \param type index type
+    \retval 1 YES
+    \retval 0 NO
+*/
+int zebra_index_type_is_index(zebra_index_type_t type);
+
+/** \brief check whether index type is of type 'sort'
+    \param type index type
+    \retval 1 YES
+    \retval 0 NO
+*/
+int zebra_index_type_is_sort(zebra_index_type_t type);
+
+/** \brief check whether index type is of type 'staticrank'
+    \param type index type
+    \retval 1 YES
+    \retval 0 NO
+*/
+int zebra_index_type_is_staticrank(zebra_index_type_t type);
+
+
+/** \brief tokenize a term for an index type
+    \param type index type
+    \param buf term buffer (pass 0 to continue with previous buf)
+    \param len term length
+    \param result_buf resulting token buffer
+    \param result_len resulting token length
+    \retval 1 token read and result is in result_buf
+    \retval 0 no token read (no more tokens in buf)
+*/
+int zebra_index_type_tokenize(zebra_index_type_t type,
+                              const char *buf, size_t len,
+                              const char **result_buf, size_t *result_len);
  
  YAZ_END_CDECL
  
diff --git a/util/index_types.c b/util/index_types.c

index edd4024..b8c4f9f 100644 (file)
--- a/util/index_types.c
+++ b/util/index_types.c
@@ -1,4 +1,4 @@
-/* $Id: index_types.c,v 1.1 2007-10-25 09:22:36 adam Exp $
+/* $Id: index_types.c,v 1.2 2007-10-25 19:25:00 adam Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
@@ -20,12 +20,18 @@
     02111-1307, USA.
  */
  
+/** 
+    \file
+    \brief Implementation of Zebra's index types system
+*/
+
  #include <assert.h>
  #include <stdlib.h>
  #include <ctype.h>
  #include <string.h>
  
  #include "index_types.h"
+#include <yaz/icu_I18N.h>
  #include <yaz/match_glob.h>
  #include <yaz/xmalloc.h>
  #include <yaz/wrbuf.h>
@@ -33,37 +39,54 @@
  
  struct zebra_index_types_s {
  #if YAZ_HAVE_XML2
-    struct zebra_index_type *rules;
+    zebra_index_type_t rules;
      xmlDocPtr doc;
  #endif
  };
  
  #if YAZ_HAVE_XML2
-struct zebra_index_type {
+struct zebra_index_type_s {
      const xmlNode *ptr;
      const char *id;
      const char *locale;
      const char *position;
      const char *alwaysmatches;
      const char *firstinfield;
-    const char *sort;
-    struct zebra_index_type *next;
+    int sort_flag;
+    int index_flag;
+    int staticrank_flag;
+    int simple_chain;
+#if HAVE_ICU
+    struct icu_chain *chain;
+#endif
+    zebra_index_type_t next;
+    WRBUF simple_buf;
+    size_t simple_off;
  };
  
-struct zebra_index_type *parse_index_type(const xmlNode *ptr)
+static void index_type_destroy(zebra_index_type_t t);
+
+zebra_index_type_t parse_index_type(const xmlNode *ptr)
  {
      struct _xmlAttr *attr;
-    struct zebra_index_type *rule;
+    struct zebra_index_type_s *rule;
      
      rule = xmalloc(sizeof(*rule)); 
      rule->next = 0;
+#if HAVE_ICU
+    rule->chain = 0;
+#endif
      rule->ptr = ptr;
      rule->locale = 0;
      rule->id = 0;
      rule->position = 0;
      rule->alwaysmatches = 0;
      rule->firstinfield = 0;
-    rule->sort = 0;
+    rule->sort_flag = 0;
+    rule->index_flag = 1;
+    rule->staticrank_flag = 0;
+    rule->simple_chain = 0;
+    rule->simple_buf = wrbuf_alloc();
      for (attr = ptr->properties; attr; attr = attr->next)
      {
          if (attr->children && attr->children->type == XML_TEXT_NODE)
@@ -78,17 +101,71 @@ struct zebra_index_type *parse_index_type(const xmlNode *ptr)
                  rule->alwaysmatches = (const char *) attr->children->content;
              else if (!strcmp((const char *) attr->name, "firstinfield"))
                  rule->firstinfield = (const char *) attr->children->content;
+            else if (!strcmp((const char *) attr->name, "index"))
+            {
+                const char *v = (const char *) attr->children->content;
+                if (v)
+                    rule->index_flag = *v == '1';
+            }
              else if (!strcmp((const char *) attr->name, "sort"))
-                rule->sort = (const char *) attr->children->content;
+            {
+                const char *v = (const char *) attr->children->content;
+                if (v)
+                    rule->sort_flag = *v == '1';
+            }
+            else if (!strcmp((const char *) attr->name, "staticrank"))
+            {
+                const char *v = (const char *) attr->children->content;
+                if (v)
+                    rule->staticrank_flag = *v == '1';
+            }
              else
              {
-                yaz_log(YLOG_WARN, "Unsupport attribute '%s' for indexrule",
+                yaz_log(YLOG_WARN, "Unsupport attribute '%s' for indextype",
                          attr->name);
-                xfree(rule);
+                index_type_destroy(rule);
                  return 0;
              }
          }
      }
+    ptr = ptr->children;
+    while (ptr && ptr->type != XML_ELEMENT_NODE)
+        ptr = ptr->next;
+    if (!ptr)
+    {
+        yaz_log(YLOG_WARN, "Missing rules for indexrule");
+        index_type_destroy(rule);
+        rule = 0;
+    }
+    else if (!strcmp((const char *) ptr->name, "icu_chain"))
+    {
+#if HAVE_ICU
+        UErrorCode status = U_ZERO_ERROR; /* ICU convention: clear before call */
+        rule->chain = icu_chain_xml_config(ptr,
+                                           rule->locale,
+                                           rule->sort_flag,
+                                           &status);
+        if (!rule->chain)
+        {
+            index_type_destroy(rule);
+            rule = 0;
+        }
+#else
+        yaz_log(YLOG_WARN, "ICU unsupported (must be part of YAZ)");
+        index_type_destroy(rule); /* xfree alone would leak rule->simple_buf */
+        rule = 0;
+#endif
+    }
+    else if (!strcmp((const char *) ptr->name, "simple"))
+    {
+        rule->simple_chain = 1;
+    }
+    else 
+    {
+        yaz_log(YLOG_WARN, "Unsupported mapping %s for indexrule",  ptr->name);
+        index_type_destroy(rule);
+        rule = 0;
+    }
      return rule;
  }
  /* YAZ_HAVE_XML2 */
@@ -106,7 +183,7 @@ zebra_index_types_t zebra_index_types_create_doc(xmlDocPtr doc)
  {
  #if YAZ_HAVE_XML2
      zebra_index_types_t r = xmalloc(sizeof(*r));
-    struct zebra_index_type **rp = &r->rules;
+    zebra_index_type_t *rp = &r->rules;
      const xmlNode *top = xmlDocGetRootElement(doc);
      
      r->doc = doc;
@@ -137,43 +214,128 @@ zebra_index_types_t zebra_index_types_create_doc(xmlDocPtr doc)
      }
      return r;
  #else
-    yaz_log(YLOG_WARN, "Cannot read index types %s because YAZ is without XML "
-            "support", fname);
+    yaz_log(YLOG_WARN, "XML unsupported. Cannot read index rules");
      return 0;
  /* YAZ_HAVE_XML2 */
  #endif
  }
  
-void zebra_index_types_destroy(zebra_index_types_t r)
+static void index_type_destroy(zebra_index_type_t t)
  {
-#if YAZ_HAVE_XML2
-    struct zebra_index_type *rule;
-    while (r->rules)
+    if (t)
      {
-        rule = r->rules;
-        r->rules = rule->next;
-        xfree(rule);
+#if HAVE_ICU
+        if (t->chain)
+            icu_chain_destroy(t->chain);
+#endif
+        wrbuf_destroy(t->simple_buf);
+        xfree(t);
      }
-    xmlFreeDoc(r->doc);
+}
  
+void zebra_index_types_destroy(zebra_index_types_t r)
+{
+    if (r)
+    {
+#if YAZ_HAVE_XML2
+        zebra_index_type_t rule;
+        while (r->rules)
+        {
+            rule = r->rules;
+            r->rules = rule->next;
+            index_type_destroy(rule);
+        }
+        xmlFreeDoc(r->doc);
+        
  #endif
-    xfree(r);
+        xfree(r);
+    }
  }
  
-const char *zebra_index_type_lookup_str(zebra_index_types_t r, const char *id)
+zebra_index_type_t zebra_index_type_get(zebra_index_types_t types, 
+                                        const char *id)
  {
  #if YAZ_HAVE_XML2
-
-    struct zebra_index_type *rule = r->rules;
+    zebra_index_type_t rule = types->rules;
          
      while (rule && !yaz_match_glob(rule->id, id))
          rule = rule->next;
-    if (rule)
-        return rule->id;
+    return rule;
  #endif
      return 0;
  }
  
+const char *zebra_index_type_lookup_str(zebra_index_types_t types,
+                                        const char *id)
+{
+    zebra_index_type_t t = zebra_index_type_get(types, id);
+    if (t)
+        return t->id; /* the stored id of the matching entry, not the argument */
+    return 0; /* zebra_index_type_get found no entry */
+}
+
+int zebra_index_type_is_index(zebra_index_type_t type)
+{
+    return type->index_flag; /* defaults to 1 when no index attribute */
+}
+
+int zebra_index_type_is_sort(zebra_index_type_t type)
+{
+    return type->sort_flag; /* 0 unless sort starts with '1' */
+}
+
+int zebra_index_type_is_staticrank(zebra_index_type_t type)
+{
+    return type->staticrank_flag; /* 0 unless staticrank starts with '1' */
+}
+
+#define SE_CHARS ";,.()-/?<> \r\n\t"
+/* Next token from simple_buf; SE_CHARS (and NUL, via strchr) delimit. */
+int tokenize_simple(zebra_index_type_t type,
+                    const char **result_buf, size_t *result_len)
+{
+    char *buf = wrbuf_buf(type->simple_buf);
+    size_t len = wrbuf_len(type->simple_buf);
+    size_t i = type->simple_off; /* resume where the previous call stopped */
+    size_t start;
+
+    while (i < len && strchr(SE_CHARS, buf[i])) /* skip separators */
+        i++;
+    start = i;
+    while (i < len && !strchr(SE_CHARS, buf[i])) /* consume one token */
+    {
+        if (buf[i] > 32 && buf[i] < 127) /* lowercase printable ASCII only */
+            buf[i] = tolower(buf[i]); /* in place: rewrites simple_buf */
+        i++;
+    }
+
+    type->simple_off = i; /* the next call continues from here */
+    if (start != i)
+    {
+        *result_buf = buf + start; /* points into simple_buf, not a copy */
+        *result_len = i - start; /* length-delimited; no NUL is written */
+        return 1;
+    }
+    return 0; /* only separators (or nothing) remained */
+ }
+
+int zebra_index_type_tokenize(zebra_index_type_t type,
+                              const char *buf, size_t len,
+                              const char **result_buf, size_t *result_len)
+{
+    if (type->simple_chain) /* set when the mapping element is "simple" */
+    {
+        if (buf) /* new input; buf == 0 continues with the stored data */
+        {
+            wrbuf_rewind(type->simple_buf);
+            wrbuf_write(type->simple_buf, buf, len); /* tokens come from here */
+            type->simple_off = 0; /* restart scanning at the beginning */
+        }
+        return tokenize_simple(type, result_buf, result_len);
+    }
+    return 0; /* types without simple_chain yield no tokens here */
+}
+
  /*
   * Local variables:
   * c-basic-offset: 4
diff --git a/util/tst_index_types.c b/util/tst_index_types.c

index e5e3310..2e0c0ac 100644 (file)
--- a/util/tst_index_types.c
+++ b/util/tst_index_types.c
@@ -1,4 +1,4 @@
-/* $Id: tst_index_types.c,v 1.2 2007-10-25 09:23:34 adam Exp $
+/* $Id: tst_index_types.c,v 1.3 2007-10-25 19:25:00 adam Exp $
     Copyright (C) 1995-2007
     Index Data ApS
  
@@ -29,19 +29,19 @@ const char *xml_str =
  "    <indextypes>"
  "      <indextype id=\"*:w:el\" position=\"1\" alwaysmatches=\"1\" firstinfield=\"1\"\n"
  "       locale=\"el\">\n"
-"        <!-- conversion rules for words -->\n"
+"        <simple/>\n"
  "      </indextype>\n"
  "      <indextype id=\"*:w\" position=\"1\" alwaysmatches=\"1\" firstinfield=\"1\"\n"
  "       locale=\"en\">\n"
-"        <!-- conversion rules for words -->\n"
+"        <simple/>\n"
  "      </indextype>\n"
  "      <indextype id=\"*:p\" position=\"0\" alwaysmatches=\"0\" firstinfield=\"0\"\n"
  "        locale=\"en\">\n"
-"        <!-- conversion rules for phrase -->\n"
+"        <simple/>\n"
  "      </indextype>\n"
  "      <indextype id=\"*:s\" sort=\"1\" \n"
  "        locale=\"en\">\n"
-"        <!-- conversion rules for phrase -->\n"
+"        <simple/>\n"
  "      </indextype>\n"
  "    </indextypes>\n"
  ;
@@ -60,12 +60,14 @@ int compare_lookup(zebra_index_types_t r, const char *id,
  
  void tst1(void)
  {
+#if YAZ_HAVE_XML2
      xmlDocPtr doc = xmlParseMemory(xml_str, strlen(xml_str));
  
      YAZ_CHECK(doc);
      if (doc)
      {
          zebra_index_types_t rules = zebra_index_types_create_doc(doc);
+        zebra_index_type_t type;
          YAZ_CHECK(rules);
  
          if (!rules)
@@ -88,8 +90,62 @@ void tst1(void)
              }
          }
  
+        type = zebra_index_type_get(rules, "any:w");
+        YAZ_CHECK(type);
+        if (type)
+        {
+            const char *buf = " How are you?";
+            size_t len = strlen(buf);
+            int r = 1;
+
+            if (r)
+            {
+                const char *result_buf = 0;
+                size_t result_len = 0;
+                r = zebra_index_type_tokenize(type, buf, len,
+                                          &result_buf, &result_len);
+                YAZ_CHECK_EQ(r, 1);
+                YAZ_CHECK(result_len == 3 &&
+                          !memcmp(result_buf, "how", result_len));
+            }
+
+            if (r)
+            {
+                const char *result_buf = 0;
+                size_t result_len = 0;
+                r = zebra_index_type_tokenize(type, 0,  0,
+                                              &result_buf, &result_len);
+                YAZ_CHECK_EQ(r, 1);
+                YAZ_CHECK(result_len == 3 && 
+                          !memcmp(result_buf, "are", result_len));
+            }            
+
+            if (r)
+            {
+                const char *result_buf = 0;
+                size_t result_len = 0;
+                r = zebra_index_type_tokenize(type, 0,  0,
+                                              &result_buf, &result_len);
+                YAZ_CHECK_EQ(r, 1);
+                YAZ_CHECK(result_len == 3 && 
+                          !memcmp(result_buf, "you", result_len));
+            }            
+
+            if (r)
+            {
+                const char *result_buf = 0;
+                size_t result_len = 0;
+                r = zebra_index_type_tokenize(type, 0,  0,
+                                              &result_buf, &result_len);
+                YAZ_CHECK_EQ(r, 0);
+            }            
+        }
          zebra_index_types_destroy(rules);
      }
+#else
+    zebra_index_types_t rules = zebra_index_types_create_doc(0); /* no doc here */
+    YAZ_CHECK(!rules);
+#endif
  }
  
  int main(int argc, char **argv)
author	Adam Dickmeiss <adam@indexdata.dk>
	Thu, 25 Oct 2007 19:25:00 +0000 (19:25 +0000)
committer	Adam Dickmeiss <adam@indexdata.dk>
	Thu, 25 Oct 2007 19:25:00 +0000 (19:25 +0000)
include/index_types.h		patch \| blob \| history
util/index_types.c		patch \| blob \| history
util/tst_index_types.c		patch \| blob \| history