Use YAZ' new icu_iter_get_org_info for snippets
author Adam Dickmeiss <adam@indexdata.dk>
Wed, 5 Jun 2013 13:45:25 +0000 (15:45 +0200)
committer Adam Dickmeiss <adam@indexdata.dk>
Wed, 5 Jun 2013 13:45:25 +0000 (15:45 +0200)
src/charsets.c
src/charsets.h
src/http_command.c
src/record.h
src/relevance.c
src/relevance.h
src/session.c

index f931445..9688628 100644 (file)
@@ -53,6 +53,8 @@ struct pp2_charset_s {
     const char *(*token_next_handler)(pp2_charset_token_t prt);
     const char *(*get_sort_handler)(pp2_charset_token_t prt);
     const char *(*get_display_handler)(pp2_charset_token_t prt);
+    void (*get_org_handler)(pp2_charset_token_t ptr,
+                            size_t *start, size_t *len);
 #if YAZ_HAVE_ICU
     struct icu_chain * icu_chn;
     UErrorCode icu_sts;
@@ -63,11 +65,15 @@ static const char *pp2_charset_token_null(pp2_charset_token_t prt);
 static const char *pp2_charset_token_a_to_z(pp2_charset_token_t prt);
 static const char *pp2_get_sort_ascii(pp2_charset_token_t prt);
 static const char *pp2_get_display_ascii(pp2_charset_token_t prt);
+static void pp2_get_org_ascii(pp2_charset_token_t prt,
+                              size_t *start, size_t *len);
 
 #if YAZ_HAVE_ICU
 static const char *pp2_charset_token_icu(pp2_charset_token_t prt);
 static const char *pp2_get_sort_icu(pp2_charset_token_t prt);
 static const char *pp2_get_display_icu(pp2_charset_token_t prt);
+static void pp2_get_org_icu(pp2_charset_token_t prt,
+                            size_t *start, size_t *len);
 #endif
 
 /* tokenzier handle */
@@ -80,6 +86,9 @@ struct pp2_charset_token_s {
 #if YAZ_HAVE_ICU
     yaz_icu_iter_t iter;
 #endif
+    const char *cp0;
+    size_t start;
+    size_t len;
 };
 
 struct pp2_charset_fact_s {
@@ -226,6 +235,7 @@ pp2_charset_t pp2_charset_create(void)
     pct->token_next_handler = pp2_charset_token_null;
     pct->get_sort_handler  = pp2_get_sort_ascii;
     pct->get_display_handler  = pp2_get_display_ascii;
+    pct->get_org_handler = pp2_get_org_ascii;
 #if YAZ_HAVE_ICU
     pct->icu_chn = 0;
 #endif // YAZ_HAVE_ICU
@@ -250,6 +260,7 @@ pp2_charset_t pp2_charset_create_icu(struct icu_chain *icu_chn)
         pct->token_next_handler = pp2_charset_token_icu;
         pct->get_sort_handler = pp2_get_sort_icu;
         pct->get_display_handler = pp2_get_display_icu;
+        pct->get_org_handler = pp2_get_org_icu;
     }
     return pct;
 }
@@ -290,6 +301,8 @@ pp2_charset_token_t pp2_charset_tokenize(pp2_charset_t pct)
     if (pct->icu_chn)
         prt->iter = icu_iter_create(pct->icu_chn);
 #endif
+    prt->start = 0;
+    prt->len = 0;
     return prt;
 }
 
@@ -313,6 +326,7 @@ void pp2_charset_token_first(pp2_charset_token_t prt,
 
     wrbuf_rewind(prt->norm_str);
     wrbuf_rewind(prt->sort_str);
+    prt->cp0 = buf;
     prt->cp = buf;
     prt->last_cp = 0;
 
@@ -354,6 +368,12 @@ const char *pp2_get_display(pp2_charset_token_t prt)
     return prt->pct->get_display_handler(prt);
 }
 
+void pp2_get_org(pp2_charset_token_t prt, size_t *start, size_t *len)
+{   /* Public entry point: forwards to the get_org_handler installed on
+     * prt's charset (pp2_get_org_ascii from pp2_charset_create, or
+     * pp2_get_org_icu from pp2_charset_create_icu).  The handler writes
+     * its result through start and len. */
+    prt->pct->get_org_handler(prt, start, len);
+}
+
+
 #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) : -1)
 /* original tokenizer with our tokenize interface, but we
    add +1 to ensure no '\0' are in our string (except for EOF)
@@ -363,6 +383,7 @@ static const char *pp2_charset_token_a_to_z(pp2_charset_token_t prt)
     const char *cp = prt->cp;
     int c;
 
+    prt->start = cp - prt->cp0;
     /* skip white space */
     while (*cp && (c = raw_char(tolower(*(const unsigned char *)cp))) < 0)
         cp++;
@@ -381,6 +402,7 @@ static const char *pp2_charset_token_a_to_z(pp2_charset_token_t prt)
         wrbuf_putc(prt->norm_str, c);
         cp++;
     }
+    prt->len = (cp - prt->cp0) - prt->start;
     prt->cp = cp;
     return wrbuf_cstr(prt->norm_str);
 }
@@ -412,6 +434,13 @@ static const char *pp2_get_display_ascii(pp2_charset_token_t prt)
     }
 }
 
+static void pp2_get_org_ascii(pp2_charset_token_t prt,
+                              size_t *start, size_t *len)
+{   /* Non-ICU handler: copy out the span that the built-in tokenizers
+     * recorded on the handle.  prt->start and prt->len are byte counts
+     * measured from prt->cp0, the buffer start saved by
+     * pp2_charset_token_first.  Both outputs are always written. */
+    *start = prt->start;
+    *len = prt->len;
+}
+
 static const char *pp2_charset_token_null(pp2_charset_token_t prt)
 {
     const char *cp = prt->cp;
@@ -420,6 +449,7 @@ static const char *pp2_charset_token_null(pp2_charset_token_t prt)
     while (*cp)
         cp++;
     prt->cp = cp;
+    prt->len = cp - prt->cp0;
     return prt->last_cp;
 }
 
@@ -443,6 +473,11 @@ static const char *pp2_get_display_icu(pp2_charset_token_t prt)
     return icu_iter_get_display(prt->iter);
 }
 
+static void pp2_get_org_icu(pp2_charset_token_t prt, size_t *start, size_t *len)
+{   /* ICU handler: delegate to YAZ's icu_iter_get_org_info on prt->iter.
+     * NOTE(review): presumably yields the offset and length of the
+     * current token within the original input - confirm against the
+     * YAZ ICU API documentation. */
+    icu_iter_get_org_info(prt->iter, start, len);
+}
+
 #endif // YAZ_HAVE_ICU
 
 
index b203ce4..3b8325b 100644 (file)
@@ -45,6 +45,7 @@ void pp2_charset_token_destroy(pp2_charset_token_t prt);
 const char *pp2_charset_token_next(pp2_charset_token_t prt);
 const char *pp2_get_sort(pp2_charset_token_t prt);
 const char *pp2_get_display(pp2_charset_token_t prt);
+void pp2_get_org(pp2_charset_token_t prt, size_t *start, size_t *len);
 
 #endif
 
index 56e1f63..30de3d8 100644 (file)
@@ -872,7 +872,8 @@ static void cmd_bytarget(struct http_channel *c)
 }
 
 static void write_metadata(WRBUF w, struct conf_service *service,
-                           struct record_metadata **ml, int full, int indent)
+                           struct record_metadata **ml, unsigned flags,
+                           int indent)
 {
     int imeta;
 
@@ -880,7 +881,7 @@ static void write_metadata(WRBUF w, struct conf_service *service,
     {
         struct conf_metadata *cmd = &service->metadata[imeta];
         struct record_metadata *md;
-        if (!cmd->brief && !full)
+        if (!cmd->brief && !(flags & 1))
             continue;
         for (md = ml[imeta]; md; md = md->next)
         {
@@ -900,7 +901,10 @@ static void write_metadata(WRBUF w, struct conf_service *service,
             switch (cmd->type)
             {
                 case Metadata_type_generic:
-                    wrbuf_xmlputs(w, md->data.text.disp);
+                    if (md->data.text.snippet && (flags & 2))
+                        wrbuf_puts(w, md->data.text.snippet);
+                    else
+                        wrbuf_xmlputs(w, md->data.text.disp);
                     break;
                 case Metadata_type_year:
                     wrbuf_printf(w, "%d", md->data.number.min);
@@ -917,7 +921,8 @@ static void write_metadata(WRBUF w, struct conf_service *service,
 }
 
 static void write_subrecord(struct record *r, WRBUF w,
-        struct conf_service *service, int show_details)
+                            struct conf_service *service, unsigned flags,
+                            int indent)
 {
     const char *name = session_setting_oneval(
         client_get_database(r->client), PZ_NAME);
@@ -934,7 +939,7 @@ static void write_subrecord(struct record *r, WRBUF w,
     wrbuf_printf(w,  "%u", r->checksum);
     wrbuf_puts(w, "\">\n");
 
-    write_metadata(w, service, r->metadata, show_details, 2);
+    write_metadata(w, service, r->metadata, flags, indent);
     wrbuf_puts(w, " </location>\n");
 }
 
@@ -997,6 +1002,8 @@ static void show_record(struct http_channel *c, struct http_session *s)
     const char *offsetstr = http_argbyname(rq, "offset");
     const char *binarystr = http_argbyname(rq, "binary");
     const char *checksumstr = http_argbyname(rq, "checksum");
+    const char *snippets = http_argbyname(rq, "snippets");
+    unsigned flags = (snippets && *snippets == '1') ? 3 : 1;
 
     if (!s)
         return;
@@ -1090,9 +1097,9 @@ static void show_record(struct http_channel *c, struct http_session *s)
         }
         wrbuf_printf(c->wrbuf, " <activeclients>%d</activeclients>\n",
                      session_active_clients(s->psession));
-        write_metadata(c->wrbuf, service, rec->metadata, 1, 1);
+        write_metadata(c->wrbuf, service, rec->metadata, flags, 1);
         for (r = rec->records; r; r = r->next)
-            write_subrecord(r, c->wrbuf, service, 2);
+            write_subrecord(r, c->wrbuf, service, flags, 2);
         response_close(c, "record");
     }
     show_single_stop(s->psession, rec);
@@ -1133,6 +1140,8 @@ static void show_records(struct http_channel *c, struct http_session *s,
     const char *num = http_argbyname(rq, "num");
     const char *sort = http_argbyname(rq, "sort");
     int version = get_version(rq);
+    const char *snippets = http_argbyname(rq, "snippets");
+    unsigned flags = (snippets && *snippets == '1') ? 2 : 0;
 
     int startn = 0;
     int numn = 20;
@@ -1187,9 +1196,9 @@ static void show_records(struct http_channel *c, struct http_session *s,
         struct conf_service *service = s->psession->service;
 
         wrbuf_puts(c->wrbuf, "<hit>\n");
-        write_metadata(c->wrbuf, service, rec->metadata, 0, 1);
+        write_metadata(c->wrbuf, service, rec->metadata, flags, 1);
         for (ccount = 0, p = rl[i]->records; p;  p = p->next, ccount++)
-            write_subrecord(p, c->wrbuf, service, 0); // subrecs w/o details
+            write_subrecord(p, c->wrbuf, service, flags, 2);
         wrbuf_printf(c->wrbuf, " <count>%d</count>\n", ccount);
        if (strstr(sort, "relevance"))
         {
index 99a9e63..f2761f5 100644 (file)
@@ -28,6 +28,7 @@ union data_types {
     struct {
         const char *disp;
         const char *sort;
+        const char *snippet;
     } text;
     struct {
         int min;
index 0551980..4d5b6e4 100644 (file)
@@ -83,6 +83,47 @@ static struct word_entry *word_entry_match(struct relevance *r,
     return 0;
 }
 
+int relevance_snippet(struct relevance *r,
+                      const char *words, const char *name,
+                      WRBUF w_snippet)
+{
+    int no = 0;
+    const char *norm_str;
+#if 1
+    yaz_log(YLOG_LOG, "relevance_snippet for field=%s content=%s",
+            name, words);
+#endif
+    pp2_charset_token_first(r->prt, words, 0);
+
+    while ((norm_str = pp2_charset_token_next(r->prt)))
+    {
+        size_t org_start, org_len;
+        struct word_entry *entries = r->entries;
+        int highlight = 0;
+        int i;
+
+        pp2_get_org(r->prt, &org_start, &org_len);
+        for (; entries; entries = entries->next, i++)
+        {
+            yaz_log(YLOG_LOG, "Compare: %s %s", norm_str, entries->norm_str);
+            if (*norm_str && !strcmp(norm_str, entries->norm_str))
+                highlight = 1;
+        }
+        if (highlight)
+            wrbuf_puts(w_snippet, "<match>");
+
+        wrbuf_xmlputs_n(w_snippet, words + org_start, org_len);
+        if (highlight)
+            wrbuf_puts(w_snippet, "</match>");
+        no += highlight;
+    }
+    if (no)
+    {
+        yaz_log(YLOG_LOG, "SNIPPET match: %s", wrbuf_cstr(w_snippet));
+    }
+    return no;
+}
+
 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
                           const char *words, const char *rank,
                           const char *name)
index 5a095eb..76bbc22 100644 (file)
@@ -38,6 +38,10 @@ void relevance_newrec(struct relevance *r, struct record_cluster *cluster);
 void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
                           const char *words, const char *multiplier,
                           const char *name);
+int relevance_snippet(struct relevance *r,
+                      const char *words, const char *name,
+                      WRBUF w_snippet);
+
 void relevance_donerecord(struct relevance *r, struct record_cluster *cluster);
 
 void relevance_prepare_read(struct relevance *rel, struct reclist *rec);
index 880fc0c..e3e0a0d 100644 (file)
@@ -1467,6 +1467,7 @@ static struct record_metadata *record_metadata_init(
 
         rec_md->data.text.disp = p;
         rec_md->data.text.sort = 0;
+        rec_md->data.text.snippet = 0;
     }
     else if (type == Metadata_type_year || type == Metadata_type_date)
     {
@@ -1956,6 +1957,18 @@ static int ingest_to_cluster(struct client *cl,
                             "for element '%s'", value, type);
                 continue;
             }
+
+            if (ser_md->type == Metadata_type_generic)
+            {
+                WRBUF w = wrbuf_alloc();
+                if (relevance_snippet(se->relevance,
+                                      (char*) value, ser_md->name, w))
+                    rec_md->data.text.snippet = nmem_strdup(se->nmem,
+                                                            wrbuf_cstr(w));
+                wrbuf_destroy(w);
+            }
+
+
             wheretoput = &record->metadata[md_field_id];
             while (*wheretoput)
                 wheretoput = &(*wheretoput)->next;
@@ -2183,7 +2196,6 @@ static int ingest_to_cluster(struct client *cl,
                 relevance_countwords(se->relevance, cluster,
                                      (char *) value, rank, ser_md->name);
             }
-
             // construct facets ... unless the client already has reported them
             if (ser_md->termlist && !client_has_facet(cl, (char *) type))
             {