Added small test of mfile sub system

[idzebra-moved-to-github.git] / doc / administration.xml
diff --git a/doc/administration.xml b/doc/administration.xml

index 0048634..8905d11 100644 (file)
--- a/doc/administration.xml
+++ b/doc/administration.xml
@@ -1,5 +1,5 @@
  <chapter id="administration">
- <!-- $Id: administration.xml,v 1.36 2006-06-12 11:59:11 marc Exp $ -->
+ <!-- $Id: administration.xml,v 1.46 2006-10-11 12:23:24 adam Exp $ -->
   <title>Administrating Zebra</title>
   <!-- ### It's a bit daft that this chapter (which describes half of
            the configuration-file formats) is separated from
@@ -94,7 +94,7 @@
    
   </sect1>
   
- <sect1 id="configuration-file">
+ <sect1 id="zebra-cfg">
    <title>The Zebra Configuration File</title>
    
    <para>
@@ -281,20 +281,67 @@
        <para>
         Specifies a path of profile specification files. 
         The path is composed of one or more directories separated by
-       colon. Similar to PATH for UNIX systems.
+       colon. Similar to <literal>PATH</literal> for UNIX systems.
        </para>
       </listitem>
      </varlistentry>
+
+     <varlistentry>
+      <term>modulePath: <replaceable>path</replaceable></term>
+      <listitem>
+       <para>
+       Specifies a path of record filter modules.
+       The path is composed of one or more directories separated by
+       colon. Similar to <literal>PATH</literal> for UNIX systems.
+       The 'make install' procedure typically puts modules in
+       <filename>/usr/local/lib/idzebra-2.0/modules</filename>.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term>staticrank: <replaceable>integer</replaceable></term>
+      <listitem>
+       <para>
+       Specifies whether static ranking is to be enabled (1) or
+       disabled (0). If omitted, it is disabled - corresponding
+       to a value of 0.
+       Refer to <xref linkend="administration-ranking-static"/>.
+       </para>
+      </listitem>
+     </varlistentry>
+
+
+     <varlistentry>
+      <term>estimatehits: <replaceable>integer</replaceable></term>
+      <listitem>
+       <para>
+       Controls whether Zebra should calculate approximate hit counts and
+       at which hit count it is to be enabled.
+       A value of 0 disables approximate hit counts.
+       For a positive value, approximate hit count is enabled
+       if it is known to be larger than <replaceable>integer</replaceable>.
+       </para>
+       <para>
+       Approximate hit counts can also be triggered by a particular
+       attribute in a query.
+       Refer to <xref linkend="querymodel-zebra-attr-approx"/>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry>
       <term>attset: <replaceable>filename</replaceable></term>
       <listitem>
        <para>
-       Specifies the filename(s) of attribute set files for use in
-       searching. At least the Bib-1 set should be loaded
-       (<literal>bib1.att</literal>).
-       The <literal>profilePath</literal> setting is used to look for
-       the specified files.
-       See <xref linkend="attset-files"/>
+       Specifies the filename(s) of attribute set files for use in
+       searching. In many configurations <filename>bib1.att</filename>
+       is used, but that is not required. If Classic Explain
+       attributes are to be used for searching,
+       <filename>explain.att</filename> must be given.
+       The path to att-files in general can be given using 
+       <literal>profilePath</literal> setting.
+       See also <xref linkend="attset-files"/>.
        </para>
       </listitem>
      </varlistentry>
@@ -356,7 +403,7 @@
         Specifies a file with description of user accounts for Zebra.
         File format is similar to that used by the passwd directive except
        that the passwords are encrypted. Use Apache's htpasswd or similar
-       for maintenanace.
+       for maintenance.
        </para>
       </listitem>
      </varlistentry>
@@ -370,11 +417,12 @@
         to access Zebra via the passwd system. There are two kinds
         of permissions currently: read (r) and write(w). By default
         users not listed in a permission directive are given the read
-       priviledge. To specify permissions for a user with no
+       privilege. To specify permissions for a user with no
         username, or Z39.50 anonymous style use
         <literal>anonymous</literal>. The permstring consists of
         a sequence of characters. Include character <literal>w</literal>
-       for write/update access, <literal>r</literal> for read access.
+       for write/update access, <literal>r</literal> for read access and
+       <literal>a</literal> to allow anonymous access through this account.
        </para>
       </listitem>
      </varlistentry>
@@ -669,7 +717,7 @@
    </para>
    
    <para>
-   (see <xref linkend="record-model-grs"/>
+   (see <xref linkend="grs"/>
      for details of how the mapping between elements of your records and
      searchable attributes is established).
    </para>
@@ -757,7 +805,7 @@
   <sect1 id="shadow-registers">
    <title>Safe Updating - Using Shadow Registers</title>
    
-  <sect2>
+  <sect2 id="shadow-registers-description">
     <title>Description</title>
     
     <para>
@@ -811,7 +859,7 @@
     
    </sect2>
    
-  <sect2>
+  <sect2 id="shadow-registers-how-to-use">
     <title>How to Use Shadow Register Files</title>
     
     <para>
@@ -925,7 +973,7 @@
   <sect1 id="administration-ranking">
    <title>Relevance Ranking and Sorting of Result Sets</title>
  
-  <sect2>
+  <sect2 id="administration-overview">
     <title>Overview</title>
     <para>
      The default ordering of a result set is left up to the server,
@@ -1041,8 +1089,9 @@
      the Bib-1 relation attribute with
      value ``relevance'' to the PQF query (that is,
      <literal>@attr&nbsp;2=102</literal>, see also  
-    <ulink url="ftp://ftp.loc.gov/pub/z3950/defs/bib1.txt">
-     The BIB-1 Attribute Set Semantics</ulink>). 
+    <ulink url="&url.z39.50;bib1.html">
+     The BIB-1 Attribute Set Semantics</ulink>, also in 
+      <ulink url="&url.z39.50.attset.bib1;">HTML</ulink>). 
      To find all articles with the word <literal>Eoraptor</literal> in
      the title, and present them relevance ranked, issue the PQF query:
      <screen>
@@ -1287,7 +1336,6 @@ where g = rset_count(terms[i]->rset) is the count of all documents in this speci
      with or without static ranking enabled.
      </para>
   
-    </sect3>
  
      <!--
      <sect3 id="administration-ranking-dynamic-rank1">
@@ -1307,7 +1355,6 @@ where g = rset_count(terms[i]->rset) is the count of all documents in this speci
     </para>
      </sect3>
      -->
-
   
     <warning>
       <para>
@@ -1331,6 +1378,7 @@ where g = rset_count(terms[i]->rset) is the count of all documents in this speci
      mitaylor2microsoft.com
     -->
  
+    </sect3>
  
      <sect3 id="administration-ranking-dynamic-cql">
       <title>Dynamically ranking CQL queries</title>
@@ -1435,26 +1483,216 @@ where g = rset_count(terms[i]->rset) is the count of all documents in this speci
   <sect1 id="administration-extended-services">
    <title>Extended Services: Remote Insert, Update and Delete</title>
    
+   <note>
+    <para>
+     Extended services are only supported when accessing the Zebra
+     server using the <ulink url="&url.z39.50;">Z39.50</ulink>
+     protocol. The <ulink url="&url.sru;">SRU</ulink> protocol does
+     not support extended services.
+    </para>
+   </note>
+   
    <para>
      The extended services are not enabled by default in zebra - due to the
-    fact that they modify the system.
-    In order to allow anybody to update, use
-    <screen>
-    perm.anonymous: rw
-    </screen>
+    fact that they modify the system. Zebra can be configured
+    to allow anybody to
+    search, and to allow only updates for a particular admin user
      in the main zebra configuration file <filename>zebra.cfg</filename>.
-    Or, even better, allow only updates for a particular admin user. For
-    user <literal>admin</literal>, you could use:
+    For user <literal>admin</literal>, you could use:
      <screen>
+     perm.anonymous: r
       perm.admin: rw
       passwd: passwordfile
      </screen>
-    And in <filename>passwordfile</filename>, specify users and
-    passwords as colon seperated strings:
+    And in the password file 
+    <filename>passwordfile</filename>, you have to specify users and
+    encrypted passwords as colon separated strings. 
+    Use a tool like <filename>htpasswd</filename> 
+    to maintain the encrypted passwords. 
      <screen> 
       admin:secret
-    </screen> 
+    </screen>
+    It is essential to configure  Zebra to store records internally, 
+    and to support
+    modifications and deletion of records:
+    <screen>
+     storeData: 1
+     storeKeys: 1
+    </screen>
+    The general record type should be set to any record filter which
+    is able to parse XML records. You may use either of the two
+    declarations (but not both simultaneously!):
+    <screen>    
+     recordType: grs.xml
+     # recordType: alvis.filter_alvis_config.xml
+    </screen>
+    To enable transaction safe shadow indexing,
+    which is extra important for this kind of operation, set
+    <screen>
+     shadow: directoryname: size (e.g. 1000M)
+    </screen>
     </para>
+   <note>
+    <para>
+     It is not possible to carry information about record types or
+     similar to Zebra when using extended services, due to
+     limitations of the <ulink url="&url.z39.50;">Z39.50</ulink>
+     protocol. Therefore, indexing filters can not be chosen on a
+     per-record basis. One and only one general XML indexing filter
+     must be defined.  
+     <!-- but because it is represented as an OID, we would need some
+     form of proprietary mapping scheme between record type strings and
+     OIDs. -->
+     <!--
+     However, as a minimum, it would be extremely useful to enable
+     people to use MARC21, assuming grs.marcxml.marc21 as a record
+     type.  
+     -->
+    </para>
+   </note>
+
+
+   <sect2 id="administration-extended-services-z3950">
+    <title>Extended services in the Z39.50 protocol</title>
+
+    <para>
+     The <ulink url="&url.z39.50;">Z39.50</ulink> standard allows
+     servers to accept special binary <emphasis>extended services</emphasis>
+     protocol packages, which may be used to insert, update and delete
+     records on servers. These carry control and update
+     information to the servers, which is encoded in seven package fields:
+    </para>
+
+    <table id="administration-extended-services-z3950-table" frame="top">
+     <title>Extended services Z39.50 Package Fields</title>
+      <tgroup cols="3">
+       <thead>
+       <row>
+         <entry>Parameter</entry>
+         <entry>Value</entry>
+         <entry>Notes</entry>
+        </row>
+      </thead>
+       <tbody>
+        <row>
+         <entry><literal>type</literal></entry>
+         <entry><literal>'update'</literal></entry>
+         <entry>Must be set to trigger extended services</entry>
+        </row>
+        <row>
+         <entry><literal>action</literal></entry>
+         <entry><literal>string</literal></entry>
+        <entry>
+         Extended service action type with 
+         one of four possible values: <literal>recordInsert</literal>,
+         <literal>recordReplace</literal>,
+         <literal>recordDelete</literal>,
+         and <literal>specialUpdate</literal>
+        </entry>
+        </row>
+        <row>
+         <entry><literal>record</literal></entry>
+         <entry><literal>XML string</literal></entry>
+         <entry>An XML formatted string containing the record</entry>
+        </row>
+        <row>
+         <entry><literal>syntax</literal></entry>
+         <entry><literal>'xml'</literal></entry>
+         <entry>Only XML record syntax is supported</entry>
+        </row>
+        <row>
+         <entry><literal>recordIdOpaque</literal></entry>
+         <entry><literal>string</literal></entry>
+         <entry>
+         Optional  client-supplied, opaque record
+         identifier used under insert operations.
+        </entry>
+        </row>
+        <row>
+         <entry><literal>recordIdNumber</literal></entry>
+         <entry><literal>positive number</literal></entry>
+         <entry>Zebra's internal system number, only for update
+         actions.
+        </entry>
+        </row>
+        <row>
+         <entry><literal>databaseName</literal></entry>
+         <entry><literal>database identifier</literal></entry>
+        <entry>
+         The name of the database to which the extended services should be 
+         applied.
+        </entry>
+        </row>
+      </tbody>
+      </tgroup>
+     </table>
+
+
+   <para>
+    The <literal>action</literal> parameter can be any of 
+    <literal>recordInsert</literal> (will fail if the record already exists),
+    <literal>recordReplace</literal> (will fail if the record does not exist),
+    <literal>recordDelete</literal> (will fail if the record does not
+       exist), and
+    <literal>specialUpdate</literal> (will insert or update the record
+       as needed).
+   </para>
+
+    <para>
+     During a  <literal>recordInsert</literal> action, the
+     usual rules for internal record ID generation apply, unless an
+     optional <literal>recordIdNumber</literal> Zebra internal ID or a
+    <literal>recordIdOpaque</literal> string identifier is assigned. 
+     The default ID generation is
+     configured using the <literal>recordId:</literal> from
+     <filename>zebra.cfg</filename>.     
+    </para>
+
+   <para>
+    The actions <literal>recordReplace</literal> or
+    <literal>recordDelete</literal> need specification of the additional 
+    <literal>recordIdNumber</literal> parameter, which must be an
+    existing Zebra internal system ID number, or the optional 
+     <literal>recordIdOpaque</literal> string parameter.
+    </para>
+
+    <para>
+     When retrieving existing
+     records indexed with GRS indexing filters, the Zebra internal 
+     ID number is returned in the field
+    <literal>/*/id:idzebra/localnumber</literal> in the namespace
+    <literal>xmlns:id="http://www.indexdata.dk/zebra/"</literal>,
+    where it can be picked up for later record updates or deletes. 
+    </para>
+    <para>
+     Records indexed with the <literal>alvis</literal> filter
+     have similar means to discover the internal Zebra ID.
+    </para>
+ 
+   <para>
+     The <literal>recordIdOpaque</literal> string parameter
+     is a client-supplied, opaque record
+     identifier, which may be used under 
+     insert, update and delete operations. The
+     client software is responsible for assigning these to
+     records.      This identifier will
+     replace zebra's own automagic identifier generation with a unique
+     mapping from <literal>recordIdOpaque</literal> to the 
+     Zebra internal <literal>recordIdNumber</literal>.
+     <emphasis>The opaque <literal>recordIdOpaque</literal> string
+     identifiers
+      are not visible in retrieval records, nor are
+      searchable, so the value of this parameter is
+      questionable. It serves mostly as a convenient mapping from
+      application domain string identifiers to Zebra internal ID's.
+     </emphasis> 
+    </para>
+   </sect2>
+
+   
+ <sect2 id="administration-extended-services-yaz-client">
+  <title>Extended services from yaz-client</title>
+
     <para>
      We can now start a yaz-client admin session and create a database:
     <screen>
@@ -1468,14 +1706,11 @@ where g = rset_count(terms[i]->rset) is the count of all documents in this speci
      from example/gils/records) and index it:
     <screen>  
      <![CDATA[
-     Z> update insert 1 esdd0006.grs
+     Z> update insert id1234 esdd0006.grs
       ]]>
     </screen>
-    The 3rd parameter - <literal>1</literal> here -
-      is the opaque record ID from <literal>Ext update</literal>.
-      It a record ID that <emphasis>we</emphasis> assign to the record
-    in question. If we do not 
-    assign one, the usual rules for match apply (recordId: from zebra.cfg).
+    The 3rd parameter - <literal>id1234</literal> here -
+      is the  <literal>recordIdOpaque</literal> package field.
     </para>
     <para>
      Actually, we should have a way to specify "no opaque record id" for
@@ -1497,10 +1732,11 @@ where g = rset_count(terms[i]->rset) is the count of all documents in this speci
      </screen>
     </para>
     <para>
-    Let's delete the beast:
+     Let's delete the beast, using the same 
+     <literal>recordIdOpaque</literal> string parameter:
      <screen>
      <![CDATA[
-     Z> update delete 1
+     Z> update delete id1234
       No last record (update ignored)
       Z> update delete 1 esdd0006.grs
       Got extended services response
@@ -1529,8 +1765,14 @@ where g = rset_count(terms[i]->rset) is the count of all documents in this speci
      after each update session in order to write your changes from the
      shadow to the live register space.
     </para>
+ </sect2>
+
+  
+ <sect2 id="administration-extended-services-yaz-php">
+  <title>Extended services from yaz-php</title>
+
     <para>
-    Extended services are also available from the YAZ client layer. An
+    Extended services are also available from the YAZ PHP client layer. An
     example of a YAZ-PHP extended service transaction is given here:
      <screen>
      <![CDATA[
@@ -1550,119 +1792,10 @@ where g = rset_count(terms[i]->rset) is the count of all documents in this speci
         echo "$error";
       ]]>
      </screen>  
-    The <literal>action</literal> parameter can be any of 
-    <literal>recordInsert</literal> (will fail if the record already exists),
-    <literal>recordReplace</literal> (will fail if the record does not exist),
-    <literal>recordDelete</literal> (will fail if the record does not
-       exist), and
-    <literal>specialUpdate</literal> (will insert or update the record
-       as needed).
-   </para>
-   <para>
-    If a record is inserted
-    using the action  <literal>recordInsert</literal> 
-    one can specify the optional
-    <literal>recordIdOpaque</literal> parameter, which is a
-    client-supplied, opaque record identifier. This identifier will
-    replace zebra's own automagic identifier generation.  
-   </para>
-   <para>
-    When using the action <literal>recordReplace</literal> or
-    <literal>recordDelete</literal>, one must specify the additional 
-    <literal>recordIdNumber</literal> parameter, which must be an
-    existing Zebra internal system ID number. When retrieving existing
-    records, the ID number is returned in the field
-    <literal>/*/id:idzebra/localnumber</literal> in the namespace
-    <literal>xmlns:id="http://www.indexdata.dk/zebra/"</literal>,
-    where it can be picked up for later record updates or deletes. 
-   </para>
- </sect1>
-
-
-  <sect1 id="gfs-config">
-   <title>YAZ Frontend Virtual Hosts</title>
-    <para>
-     <command>zebrasrv</command> uses the YAZ server frontend and does
-     support multiple virtual servers behind multiple listening sockets.
      </para>
-    &zebrasrv-virtual;
- 
-   <para>
-    Section "Virtual Hosts" in the YAZ manual.
-    <filename>http://www.indexdata.dk/yaz/doc/server.vhosts.tkl</filename>
-   </para>
- </sect1>
-
-
-  <sect1 id="administration-cql-to-pqf">
-   <title>Server Side CQL to PQF Query Translation</title>
-   <para>
-    Using the
-    <literal>&lt;cql2rpn&gt;l2rpn.txt&lt;/cql2rpn&gt;</literal>
-      YAZ Frontend Virtual
-    Hosts option, one can configure
-    the YAZ Frontend CQL-to-PQF
-    converter, specifying the interpretation of various 
-    <ulink url="http://www.loc.gov/standards/sru/cql/">CQL</ulink>
-    indexes, relations, etc. in terms of Type-1 query attributes.
-    <!-- The  yaz-client config file -->  
-   </para>
-   <para>
-    For example, using server-side CQL-to-PQF conversion, one might
-    query a zebra server like this:
-    <screen>
-    <![CDATA[
-     yaz-client localhost:9999
-     Z> querytype cql
-     Z> find text=(plant and soil)
-     ]]>
-    </screen>
-     and - if properly configured - even static relevance ranking can
-     be performed using CQL query syntax:
-    <screen>
-    <![CDATA[
-     Z> find text = /relevant (plant and soil)
-     ]]>
-     </screen>
-   </para>
-
-   <para>
-    By the way, the same configuration can be used to 
-    search using client-side CQL-to-PQF conversion:
-    (the only difference is <literal>querytype cql2rpn</literal> 
-    instead of 
-    <literal>querytype cql</literal>, and the call specifying a local
-    conversion file)
-    <screen>
-    <![CDATA[
-     yaz-client -q local/cql2pqf.txt localhost:9999
-     Z> querytype cql2rpn
-     Z> find text=(plant and soil)
-     ]]>
-     </screen>
-   </para>
-
-   <para>
-    Exhaustive information can be found in the
-    Section "Specification of CQL to RPN mappings" in the YAZ manual.
-    <ulink url="http://www.indexdata.dk/yaz/doc/tools.tkl#tools.cql.map">
-     http://www.indexdata.dk/yaz/doc/tools.tkl#tools.cql.map</ulink>,
-   and shall therefore not be repeated here.
-   </para> 
-  <!-- 
-  <para>
-    See 
-      <ulink url="http://www.loc.gov/z3950/agency/zing/cql/dc-indexes.html">
-      http://www.loc.gov/z3950/agency/zing/cql/dc-indexes.html</ulink>
-    for the Maintenance Agency's work-in-progress mapping of Dublin Core
-    indexes to Attribute Architecture (util, XD and BIB-2)
-    attributes.
-   </para>
-   -->
+    </sect2>
   </sect1>
  
-
- 
  </chapter>
  
   <!-- Keep this comment at the end of the file