From: Adam Dickmeiss Date: Sun, 3 Sep 2006 21:37:26 +0000 (+0000) Subject: Change makefiles so that html files are automatically installed and X-Git-Tag: ZEBRA.2.0.2~32 X-Git-Url: http://sru.miketaylor.org.uk/cgi-bin?a=commitdiff_plain;h=37dc985516f52f34fc8434cc8beb982bb0c8988f;p=idzebra-moved-to-github.git Change makefiles so that html files are automatically installed and part of dist. No more maintenance of HTMLFILES ... Section about fields and character maps moved to separate chapter because it relates to all record types (not only grs). Added lots of id attributes to sections to get rid of docbook xsl warnings and to ensure proper HTML file name. Added Olegs paper on MARC indexing to grs chapter. --- diff --git a/doc/Makefile.am b/doc/Makefile.am index c678466..43308ed 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -1,4 +1,4 @@ -## $Id: Makefile.am,v 1.57 2006-08-14 19:33:38 adam Exp $ +## $Id: Makefile.am,v 1.58 2006-09-03 21:37:26 adam Exp $ docdir=$(datadir)/doc/$(PACKAGE)$(PACKAGE_SUFFIX) SUBDIRS = common @@ -7,6 +7,7 @@ XMLFILES = \ administration.xml \ architecture.xml \ examples.xml \ + field-structure.xml \ idzebra-config-man.xml \ indexdata.xml \ installation.xml \ @@ -28,55 +29,7 @@ XMLFILES = \ zebrasrv-synopsis.xml \ zebrasrv-virtual.xml -HTMLFILES = \ - administration-extended-services.html \ - administration-ranking.html \ - administration.html \ - architecture-maincomponents.html \ - architecture-workflow.html \ - architecture.html \ - configuration-file.html \ - example1.html \ - example2.html \ - examples.html \ - features.html \ - file-ids.html \ - future.html \ - generic-ids.html \ - gfs-config.html \ - grs-exchange-formats.html \ - grs-internal-representation.html \ - index.html \ - indexdata.html \ - installation-upgrade.html \ - installation.debian.html \ - installation.html \ - installation.win32.html \ - introduction-apps.html \ - introduction-support.html \ - introduction.html \ - license.html \ - locating-records.html \ - 
protocol-support.html \ - querymodel-cql-to-pqf.html \ - querymodel-pqf.html \ - querymodel-zebra.html \ - querymodel.html \ - quick-start.html \ - record-model-alvisxslt-conf.html \ - record-model-alvisxslt.html \ - record-model-grs-conf.html \ - record-model-grs.html \ - register-location.html \ - server-sru-support.html \ - server-sru.html \ - server.html \ - shadow-registers.html \ - simple-indexing.html \ - zebraidx.html \ - htmlhelp.hhp \ - toc.hhc - +HTMLFILES = index.html PNGFILES=zebra.png EPSFILES=zebra.eps @@ -108,6 +61,7 @@ idzebra-config$(PACKAGE_SUFFIX).1: idzebra-config-man.xml mv idzebra-config.1 idzebra-config$(PACKAGE_SUFFIX).1 $(HTMLFILES): $(XMLFILES) + rm -f *.html $(HTML_COMPILE) $(srcdir)/zebra.xml index.tkl: $(XMLFILES) @@ -135,21 +89,16 @@ clean-data-hook: rm -f [0-9]* *.bak dist-hook: - if test -f index.html; then for f in *.html; do \ - found=0; \ - b=`basename $$f`; \ - for h in $(HTMLFILES); do \ - if test "$$h" = "$$b"; then \ - found=1; \ - fi \ - done; \ - if test "$$found" = "0"; then \ - echo "$$f not found in HTMLFILES"; \ - exit 1; \ - fi \ - done; fi + if test -d index.html; then d=.; else d="$(srcdir)"; fi; \ + for p in $$d/*.html; do \ + cp $$p $(distdir); \ + done install-data-hook: cd $(DESTDIR)$(man1dir) && ln -sf zebraidx$(PACKAGE_SUFFIX).1 zebraidx.1 cd $(DESTDIR)$(man8dir) && ln -sf zebrasrv$(PACKAGE_SUFFIX).8 zebrasrv.8 cd $(DESTDIR)$(man1dir) && ln -sf idzebra-config$(PACKAGE_SUFFIX).1 idzebra-config.1 + if test -d index.html; then d=.; else d="$(srcdir)"; fi; \ + for p in $$d/*.html; \ + do $(docDATA_INSTALL) $$p $(DESTDIR)/$(docdir); \ + done diff --git a/doc/administration.xml b/doc/administration.xml index 0445860..beba166 100644 --- a/doc/administration.xml +++ b/doc/administration.xml @@ -1,5 +1,5 @@ - + Administrating Zebra + Overview of Zebra Architecture - +
Local Representation - + As mentioned earlier, Zebra places few restrictions on the type of data that you can index and manage. Generally, whatever the form of @@ -30,9 +30,9 @@ "grs" keyword, separated by "." characters. --> - +
- +
Main Components The Zebra system is designed to support a wide range of data management @@ -58,7 +58,7 @@ documentation and modules. - +
Core Zebra Libraries Containing Common Functionality The core Zebra module is the meat of the zebraidx @@ -129,10 +129,10 @@ idzebra-2.0-common includes common essential Zebra configuration files. - +
- +
Zebra Indexer The zebraidx @@ -145,9 +145,9 @@ The Debian package idzebra-2.0-utils contains the zebraidx utility. - +
- +
Zebra Searcher/Retriever This is the executable which runs the Z39.50/SRU/SRW server and @@ -158,9 +158,9 @@ The Debian package idzebra-2.0-utils contains the zebrasrv utility. - +
- +
YAZ Server Frontend The YAZ server frontend is @@ -190,9 +190,9 @@ It is packaged in the Debian packages yaz and libyaz. - +
- +
Record Models and Filter Modules The hard work of knowing what to index, @@ -209,18 +209,18 @@ - +
TEXT Record Model and Filter Module Plain ASCII text filter. TODO: add information here. - +
- +
GRS Record Model and Filter Modules The GRS filter modules described in - + are all based on the Z39.50 specifications, and it is absolutely mandatory to have the reference pages on BIB-1 attribute sets at hand when configuring GRS filters. The GRS filters come in @@ -260,9 +260,9 @@ trees. Also have a look at the Alvis XML/XSLT filter described in the next section. - +
- +
ALVIS Record Model and Filter Module The Alvis filter for XML files is an XSLT based input @@ -302,23 +302,23 @@ The Debian package libidzebra-2.0-mod-alvis contains the Alvis filter module. - +
- +
- +
- +
Indexing and Retrieval Workflow @@ -368,9 +368,7 @@ - - - +
diff --git a/doc/entities.ent b/doc/entities.ent index b0713e4..2ad5801 100644 --- a/doc/entities.ent +++ b/doc/entities.ent @@ -1,4 +1,4 @@ - + @@ -10,6 +10,7 @@ + @@ -25,6 +26,6 @@ '> '> + '> '> diff --git a/doc/examples.xml b/doc/examples.xml index c8b89a3..298df4b 100644 --- a/doc/examples.xml +++ b/doc/examples.xml @@ -1,8 +1,8 @@ - + Example Configurations - + Overview diff --git a/doc/field-structure.xml b/doc/field-structure.xml new file mode 100644 index 0000000..c354795 --- /dev/null +++ b/doc/field-structure.xml @@ -0,0 +1,257 @@ + + + Field Structure and Character Sets + + + + In order to provide a flexible approach to national character set + handling, Zebra allows the administrator to set up the + system to handle any 8-bit character set — including sets that + require multi-octet diacritics or other multi-octet characters. The + definition of a character set includes a specification of the + permissible values, their sort order (this affects the display in the + SCAN function), and relationships between upper- and lowercase + characters. Finally, the definition includes the specification of + space characters for the set. + + + + The operator can define different character sets for different fields, + typical examples being standard text fields, numerical fields, and + special-purpose fields such as WWW-style linkages (URx). + + +
+ The default.idx file + + The field types, and hence character sets, are associated with data + elements by the .abs files (see above). + The file default.idx + provides the association between field type codes (as used in the .abs + files) and the character map files (with the .chr suffix). The format + of the .idx file is as follows: + + + + + + + index field type code + + + This directive introduces a new search index code. + The argument is a one-character code to be used in the + .abs files to select this particular index type. An index, roughly, + corresponds to a particular structure attribute during search. Refer + to . + + + + sort field code type + + + This directive introduces a + sort index. The argument is a one-character code to be used in the + .abs file to select this particular index type. The corresponding + use attribute must be used in the sort request to refer to this + particular sort index. The corresponding character map (see below) + is used in the sort process. + + + + completeness boolean + + + This directive enables or disables complete field indexing. + The value of the boolean should be 0 + (disable) or 1. If completeness is enabled, the index entry will + contain the complete contents of the field (up to a limit), with words + (non-space characters) separated by single space characters + (normalized to " " on display). When completeness is + disabled, each word is indexed as a separate entry. Complete subfield + indexing is most useful for fields which are typically browsed (eg. + titles, authors, or subjects), or instances where a match on a + complete subfield is essential (eg. exact title searching). For fields + where completeness is disabled, the search engine will interpret a + search containing space characters as a word proximity search. + + + + charmap filename + + + This is the filename of the character + map to be used for this index for field type. + + + + +
+ +
+ The character map file format + + The contents of the character map files are structured as follows: + + + + + + + lowercase value-set + + + This directive introduces the basic value set of the field type. + The format is an ordered list (without spaces) of the + characters which may occur in "words" of the given type. + The order of the entries in the list determines the + sort order of the index. In addition to single characters, the + following combinations are legal: + + + + + + + + Backslashes may be used to introduce three-digit octal, or + two-digit hex representations of single characters + (preceded by x). + In addition, the combinations + \\, \\r, \\n, \\t, \\s (space — remember that real + space-characters may not occur in the value definition), and + \\ are recognized, with their usual interpretation. + + + + + + Curly braces {} may be used to enclose ranges of single + characters (possibly using the escape convention described in the + preceding point), eg. {a-z} to introduce the + standard range of ASCII characters. + Note that the interpretation of such a range depends on + the concrete representation in your local, physical character set. + + + + + + Parentheses () may be used to enclose multi-byte characters - + eg. diacritics or special national combinations (eg. Spanish + "ll"). When found in the input stream (or a search term), + these characters are viewed and sorted as a single character, with a + sorting value depending on the position of the group in the value + statement. + + + + + + + + + uppercase value-set + + + This directive introduces the + upper-case equivalences to the value set (if any). The number and + order of the entries in the list should be the same as in the + lowercase directive. + + + + space value-set + + + This directive introduces the characters + which separate words in the input stream.
Depending on the + completeness mode of the field in question, these characters either + terminate an index entry, or delimit individual "words" in + the input stream. The order of the elements is not significant — + otherwise the representation is the same as for the + uppercase and lowercase + directives. + + + + map value-set + target + + + This directive introduces a mapping between each of the + members of the value-set on the left to the character on the + right. The character on the right must occur in the value + set (the lowercase directive) of the + character set, but it may be a parenthesis-enclosed + multi-octet character. This directive may be used to map + diacritics to their base characters, or to map HTML-style + character-representations to their natural form, etc. The + map directive can also be used to ignore leading articles in + searching and/or sorting, and to perform other special + transformations. See the section "Ignoring leading articles". + + + + +
+
+ Ignoring leading articles + + In addition to specifying sort orders, space (blank) handling, + and upper/lowercase folding, you can also use the character map + files to make Zebra ignore leading articles in sorting records, + or when doing complete field searching. + + + This is done using the map directive in the + character map file. In a nutshell, what you do is map certain + sequences of characters, when they occur in the + beginning of a field, to a space. Assuming that the + character "@" is defined as a space character in your file, you + can do: + + map (^The\s) @ + map (^the\s) @ + + The effect of these directives is to map either 'the' or 'The', + followed by a space character, to a space. The hat ^ character + denotes beginning-of-field only when complete-subfield indexing + or sort indexing is taking place; otherwise, it is treated just + as any other character. + + + Because the default.idx file can be used to + associate different character maps with different indexing types + -- and you can create additional indexing types, should the need + arise -- it is possible to specify that leading articles should + be ignored either in sorting, in complete-field searching, or + both. + + + If you ignore certain prefixes in sorting, then these will be + eliminated from the index, and sorting will take place as if + they weren't there. However, if you set the system up to ignore + certain prefixes in searching, then these + are deleted both from the indexes and from query terms, when the + client specifies complete-field searching. This has the effect + that a search for 'the science journal' and 'science journal' + would both produce the same results. + +
+
+ diff --git a/doc/installation.xml b/doc/installation.xml index e1c77d3..c19627f 100644 --- a/doc/installation.xml +++ b/doc/installation.xml @@ -1,25 +1,23 @@ - + Installation Zebra is written in ANSI C and was implemented with portability in mind. - We primarily use GCC on UNIX and - - Microsoft Visual C++ - on Windows. + We primarily use GCC on UNIX and + Microsoft Visual C++ on Windows. The software is regularly tested on - Debian GNU/Linux, - Redhat Linux, - Gentoo Linux, - SuSE Linux, - FreeBSD (i386), - MAC OSX, - SunOS 5.8 + Debian GNU/Linux, + Redhat Linux, + Gentoo Linux, + SuSE Linux, + FreeBSD (i386), + MAC OSX, + SunOS 5.8 (sparc), - Windows 2000. + Windows 2000. @@ -28,17 +26,18 @@ - yaz + yaz (required) - Zebra uses YAZ to support Z39.50/ SRW. Also the memory management - utilites from YAZ is used by Zebra. + Zebra uses YAZ to support Z39.50 / + SRU. + Also the memory management utilites from YAZ is used by Zebra. - iconv + iconv (optional) @@ -49,7 +48,7 @@ - Expat + Expat (optional) @@ -61,19 +60,7 @@ - Perl (optional) - - - Perl is required if you're going to use the Zebra perl - filter facility or the Zebra perl API. Perl is preinstalled - on many Unixes. We've not tried the Perl extension on - Windows ourselves. - - - - - - Tcl (optional) + Tcl (optional) Tcl is required if you need to use the Tcl record filter @@ -85,8 +72,8 @@ - Autoconf, - Automake + Autoconf, + Automake (optional) @@ -98,7 +85,7 @@ - Docbook + Docbook and friends (optional) @@ -112,7 +99,7 @@ - UNIX +
UNIX On Unix, GCC works fine, but any native C compiler should be possible to use as long as it is @@ -201,10 +188,10 @@ You can override this with the --prefix option to configure. - +
- GNU/Debian - GNU/Debian Linux on + <section id="installation-debian"><title>GNU/Debian +
GNU/Debian Linux on i686 Platform Index Data provides pre-compiled GNU/Debian i686 Linux packages @@ -230,18 +217,18 @@ apt-get update as root, the - Zebra indexer is + Zebra indexer is easily installed issuing apt-get install idzebra-2.0 idzebra-2.0-doc - +
- +
Ubuntu/Debian and GNU/Debian on other platforms - These Zebra + These Zebra packages are specifically compiled for GNU/Debian Linux systems. Installation on other GNU/Debian systems is possible by @@ -262,7 +249,7 @@ apt-get build-dep idzebra-2.0 as root, the - Zebra indexer is + Zebra indexer is recompiled and installed issuing apt-get source --compile idzebra-2.0 @@ -274,10 +261,10 @@ dpkg -i install idzebra-2.0*.deb libidzebra-2.0*.deb - - +
+ - WIN32 +
WIN32 The easiest way to install Zebra on Windows is by downloading an installer from here. @@ -287,10 +274,9 @@ Zebra is shipped with "makefiles" for the NMAKE tool that comes - with - Microsoft Visual C++. - Version 6 has been tested. We expect that zebra compiles - with version 5 as well. + with Microsoft Visual C++. + Versions 2003 and 2005 have been tested. We expect that zebra compiles + with version 6 as well. Start a command prompt and switch the sub directory @@ -327,12 +313,11 @@ EXPAT_DIR If HAVE_EXPAT is set to 1, Zebra is compiled - with Expat support. In this configuration, set - EXPAT_DIR to the Expat source directory. + with Expat support. + In this configuration, set + EXPAT_DIR to the Expat source directory. Windows version of Expat can be downloaded from - - SourceForge - . + SourceForge. @@ -344,9 +329,7 @@ with iconv support. In this configuration, set ICONV_DIR to the iconv source directory. Iconv binaries can be downloaded from - - this site - . + this site. @@ -358,8 +341,7 @@ Define these symbols if Zebra is to be compiled with - BZIP2 - record compression support. + BZIP2 record compression support. @@ -414,10 +396,10 @@ - +
- +
Upgrading from Zebra version 1.3.x Zebra's installation directories have changed a bit. In addition, @@ -491,7 +473,7 @@ attset: idxpath.att - +
+ Introduction - +
Overview @@ -34,9 +34,9 @@ and how to configure the server to give you the functionality that you need. - +
- +
Features @@ -228,9 +228,9 @@ - +
- +
References and Zebra based Applications Zebra has been deployed in numerous applications, in both the @@ -245,7 +245,7 @@ - +
Koha free open-source ILS Koha is a full-featured @@ -300,9 +300,9 @@ Koha Earns its Stripes. - +
- +
Emilda open source ILS Emilda @@ -320,9 +320,9 @@ As a surplus, 100% MARC compatibility has been achieved using the Zebra Server from Index Data as backend server. - +
- +
ReIndex.Net web based ILS Reindex.net @@ -342,12 +342,12 @@ Internally MARCXML is used for bibliographical records. Update utilizes Z39.50 extended services. - +
- - - DADS - the DTV Article Database Service - +
+ DADS - the DTV Article Database + Service + DADS is a huge database of more than ten million records, totalling over ten gigabytes of data. The records are metadata about academic journal articles, primarily scientific; about 10% of these @@ -368,9 +368,9 @@ and - +
- +
Infonet Eprints The InfoNet Eprints service from the @@ -387,9 +387,9 @@ The online search facility is found at . - +
- +
Alvis The Alvis EU @@ -410,10 +410,10 @@ in about 4 hours, resulting in search times of fractions of seconds. - +
- +
ULS (Union List of Serials) The M25 Systems Team @@ -439,9 +439,9 @@ More information can be found at - +
- +
NLI-Z39.50 - a Natural Language Interface for Libraries Fernuniversität Hagen in Germany have developed a natural @@ -469,9 +469,9 @@ For more information, contact Johannes Leveling Johannes.Leveling@FernUni-Hagen.De - +
- +
Various web indexes Zebra has been used by a variety of institutions to construct @@ -526,11 +526,11 @@ - - +
+
- +
Support You can get support for Zebra from at least three sources. @@ -558,10 +558,10 @@ for details. - +
- +
Future Directions @@ -658,7 +658,7 @@ or check the contact info at the end of this manual. - +
+ License @@ -28,7 +28,8 @@ 02111-1307, USA. - GNU General Public License + + GNU General Public License GNU GENERAL PUBLIC LICENSE Version 2, June 1991 diff --git a/doc/querymodel.xml b/doc/querymodel.xml index 82d25ec..bbadb28 100644 --- a/doc/querymodel.xml +++ b/doc/querymodel.xml @@ -1,11 +1,11 @@ - + Query Model - +
Query Model Overview - +
Query Languages @@ -29,7 +29,7 @@ - +
Prefix Query Format (PQF) Index Data has defined a textual representation in the @@ -44,9 +44,9 @@ for further explanations and descriptions of Zebra's capabilities. - +
- +
Common Query Language (CQL) The query model of the type-1 RPN, @@ -59,11 +59,11 @@ Zebra can be configured to understand and map CQL to PQF. See . - +
- +
- +
Operation types Zebra supports all of the three different @@ -73,7 +73,7 @@ functionality and purpose of each is quite in order here. - +
Explain Operation The syntax of Z39.50/SRU queries is @@ -106,9 +106,9 @@ auto-configure a client user interface to the servers capabilities. - +
- + - +
Scan Operation The scan operation is a helper functionality, @@ -137,14 +137,14 @@ spelling of search terms, to auto-fill search boxes, or to display controlled vocabularies. - +
- +
- +
- +
Prefix Query Format syntax and semantics The PQF grammar @@ -155,7 +155,7 @@ query parse tree. - +
PQF tree structure The PQF parse tree - or the equivalent textual representation - @@ -169,7 +169,7 @@ complex query trees. - +
Attribute sets Attribute sets define the exact meaning and semantics of queries @@ -244,9 +244,9 @@ - +
- +
Boolean operators A pair of sub query trees, or of atomic queries, is combined @@ -334,10 +334,10 @@ Z> find "information retrieval" - +
- +
Atomic queries (APT) Atomic queries are the query parts which work on one access point @@ -421,10 +421,10 @@ Z> scan @attr 1=4 debussy - +
- +
Named Result Sets Named result sets are supported in Zebra, and result sets can be @@ -466,9 +466,9 @@ the SRU protocol. - +
- +
Zebra's special access point of type 'string' The numeric use (type 1) attribute is usually @@ -511,13 +511,13 @@ See also for details, and - + for the SRU PQF query extension using string names as a fast debugging facility. - +
- +
Zebra's special access point of type 'XPath' for GRS filters @@ -536,7 +536,7 @@ When using the GRS Record Model - (see ), we have the + (see ), we have the possibility to embed live XPath expressions in the PQF queries, which are here called @@ -624,10 +624,10 @@ size is medium to large. - - +
+
- +
Explain Attribute Set The Z39.50 standard defines the @@ -652,7 +652,7 @@ within any explain query. - +
Use Attributes (type = 1) The following Explain search attributes are supported: @@ -672,9 +672,9 @@ Z39.50 standard for more information. - +
- +
Explain searches with yaz-client Classic Explain only defines retrieval of Explain information @@ -755,11 +755,11 @@ Z> find @attrset exp1 @and @attr 1=1 attributedetails @attr 1=3 Default - +
- +
- +
Bib1 Attribute Set Most of the information contained in this section is an excerpt of @@ -775,7 +775,7 @@ - +
Use Attributes (type 1) @@ -830,15 +830,15 @@ Z> scan @attr 1=4 information - +
- +
- +
Zebra general Bib1 Non-Use Attributes (type 2-6) - +
Relation Attributes (type 2) @@ -982,9 +982,9 @@ - +
- +
Position Attributes (type 3) @@ -1029,9 +1029,9 @@ any position in field (3). A proper diagnostic should have been issued. - +
- +
Structure Attributes (type 4) @@ -1209,9 +1209,9 @@ . - +
- +
Truncation Attributes (type = 5) @@ -1338,9 +1338,9 @@ ... - +
- +
Completeness Attributes (type = 6) @@ -1411,13 +1411,13 @@ . - - +
+
- +
- +
Advanced Zebra PQF Features The Zebra internal query engine has been extended to specific needs @@ -1431,7 +1431,7 @@ idxpath attribute set. - +
Zebra specific retrieval of all records Zebra defines a hardwired string index name @@ -1464,9 +1464,9 @@ well change in future releases of Zebra. - +
- + - +
Zebra specific Scan Extensions to all Attribute Sets Zebra extends the Bib1 attribute types, and these extensions are @@ -1729,7 +1729,7 @@ - +
Zebra Extension Result Set Narrow (type 8) If attribute Result Set Narrow (type 8) @@ -1770,9 +1770,9 @@ Experimental. Do not use in production code. - +
- +
Zebra Extension Approximative Limit (type 11) The Zebra Extension Approximative Limit (type 11) is a way to @@ -1790,10 +1790,10 @@ Experimental and buggy. Definitely not to be used in production code. - - +
+
- +
Zebra special IDXPATH Attribute Set for GRS indexing The attribute-set idxpath consists of a single @@ -1815,7 +1815,7 @@ - +
IDXPATH Use Attributes (type = 1) This attribute set allows one to search GRS filter indexed @@ -1939,11 +1939,11 @@ - - +
+
- +
Mapping from PQF atomic APT queries to Zebra internal register indexes @@ -1955,7 +1955,7 @@ the named register. - +
Mapping of PQF APT access points Zebra understands four fundamentally different types of access @@ -2088,10 +2088,10 @@ - +
- +
Mapping of PQF APT structure and completeness to register type @@ -2298,10 +2298,10 @@ contents. - - +
+
- +
Zebra Regular Expressions in Truncation Attribute (type = 5) @@ -2405,7 +2405,7 @@ Z> find @attr 1=4 @attr 5=102 @attr 2=102 "informat.* retrieval" - +
- +
- +
Server Side CQL to PQF Query Translation Using the @@ -2489,9 +2489,7 @@ attributes. --> - - - +
diff --git a/doc/recordmodel-alvisxslt.xml b/doc/recordmodel-alvisxslt.xml index 970e922..d067622 100644 --- a/doc/recordmodel-alvisxslt.xml +++ b/doc/recordmodel-alvisxslt.xml @@ -1,5 +1,5 @@ - + ALVIS XML Record Model and Filter Module @@ -19,7 +19,7 @@ - +
ALVIS Record Filter The experimental, loadable Alvis XML/XSLT filter module @@ -76,7 +76,7 @@ identifier="http://indexdata.dk/zebra/xslt/1". - +
ALVIS Internal Record Representation When indexing, an XML Reader is invoked to split the input files into suitable record XML pieces. Each record piece is then @@ -89,9 +89,9 @@ you can use this functionality inside the Alvis filter configuration XSLT stylesheets. - +
- +
ALVIS Canonical Indexing Format The output of the indexing XSLT stylesheets must contain certain elements in the magic @@ -212,7 +212,7 @@ http://localhost:9999/?version=1.1&operation=scan&x-pScanClause=@attr+1=dc:date+@attr+4=2+a ]]> - See for more information on SRU/SRW + See for more information on SRU/SRW configuration, and or the YAZ manual CQL section @@ -227,15 +227,15 @@ filter configuration files involves in this process, and that the literal index names are used during search and retrieval. - - +
+
- +
ALVIS Record Model Configuration - +
ALVIS Indexing Configuration As mentioned above, there can be only one indexing @@ -373,9 +373,9 @@ to suffering and pain, and universal disintegration of your project schedule. - +
- +
ALVIS Exchange Formats An exchange format can be anything which can be the outcome of an @@ -422,9 +422,9 @@ - +
- +
ALVIS Filter OAI Indexing Example The sourcecode tarball contains a working Alvis filter example in @@ -444,9 +444,9 @@ http://www.oaforum.org/tutorial/. - +
- +
diff --git a/doc/recordmodel-grs.xml b/doc/recordmodel-grs.xml index a2c798e..bd11180 100644 --- a/doc/recordmodel-grs.xml +++ b/doc/recordmodel-grs.xml @@ -1,7 +1,6 @@ - - + + GRS Record Model and Filter Modules - The record model described in this chapter applies to the fundamental, @@ -11,7 +10,7 @@ - +
GRS Record Filters Many basic subtypes of the grs type are @@ -21,120 +20,116 @@ - grs.sgml + grs.sgml This is the canonical input format described . It is using simple SGML-like syntax. - - grs.marc + grs.marc.type This allows Zebra to read records in the ISO2709 (MARC) encoding standard. - + well as the indexing rules. + + The grs.marc uses an internal representation + which is not XML conformant. In particular MARC tags are + presented as elements with the same name. And XML elements + may not start with digits. Therefore this filter is only + suitable for systems returning GRS-1 and MARC records. For XML + use grs.marcxml filter instead (see below). The loadable grs.marc filter module is packaged in the GNU/Debian package - libidzebra1.4-mod-grs-marc - + libidzebra2.0-mod-grs-marc + - grs.marcxml + grs.marcxml.type - This allows Zebra to read - records in the ISO2709??? (MARCXML) encoding standard. + This allows Zebra to read ISO2709 encoded records. + Last parameter type names the + .abs file (see below) + which describes the specific MARC structure of the input record as + well as the indexing rules. - The loadable grs.marcxml filter module - is also contained in the GNU/Debian package - libidzebra1.4-mod-grs-marc - - - - - grs.danbib - - - The grs.danbib filter parses DanBib - records, a danish MARC record variant called DANMARC. - DanBib is the Danish Union Catalogue hosted by the - Danish Bibliographic Centre (DBC). + The internal representation for grs.marcxml + is the same as for MARCXML. + It is slightly more complicated to work with than + grs.marc but XML conformant. - The loadable grs.danbib filter module - is packages in the GNU/Debian package - libidzebra1.4-mod-grs-danbib.
+ + The loadable grs.marcxml filter module + is also contained in the GNU/Debian package + libidzebra2.0-mod-grs-marc - grs.xml + grs.xml - This filter reads XML records and uses Expat to + This filter reads XML records and uses + Expat to parse them and convert them into IDZebra's internal grs record model. - Only one record per file - is supported. The filter is only available if Zebra/YAZ - is compiled with EXPAT support. + Only one record per file is supported, due to the fact that XML does + not allow two documents to "follow" each other (there is no way + to know when a document is finished). + This filter is only available if Zebra is compiled with EXPAT support. - The loadable grs.xml filter module - is packaged in the GNU/Debian package - libidzebra1.4-mod-grs-xml + The loadable grs.xml filter module + is packaged in the GNU/Debian package + libidzebra2.0-mod-grs-xml - grs.regx + grs.regx.filter This enables a user-supplied Regular Expressions input - filter described in - . + filter described in . - The loadable grs.regx filter module - is packaged in the GNU/Debian package - libidzebra1.4-mod-grs-regx - + The loadable grs.regx filter module + is packaged in the GNU/Debian package + libidzebra2.0-mod-grs-regx + - grs.tcl + grs.tcl.filter Similar to grs.regx but using Tcl for rules, described in . - The loadable grs.tcl filter module - is also packaged in the GNU/Debian package - libidzebra1.4-mod-grs-regx - + The loadable grs.tcl filter module + is also packaged in the GNU/Debian package + libidzebra2.0-mod-grs-regx + - +
GRS Canonical Input Format @@ -207,7 +202,7 @@ structured data element such a Supplier element. - +
Record Root @@ -234,9 +229,9 @@ - +
- +
Variants @@ -272,7 +267,7 @@ The available values for the class and type fields are given by the variant set that is associated with the current schema - (see ). + (see ). @@ -331,11 +326,11 @@ of the end-user. - +
- +
- +
GRS REGX And TCL Input Filters @@ -578,11 +573,11 @@ mechanisms for modifying the elements of a record. - +
- +
- +
GRS Internal Record Representation @@ -633,7 +628,7 @@ different tag path. - +
Tagged Elements @@ -650,9 +645,9 @@ reached from the root of the record). - +
- +
Variants @@ -686,9 +681,9 @@ type, value, corresponding to the variant mechanism of Z39.50. - +
- +
Data Elements @@ -702,11 +697,11 @@ --> - +
- +
- +
GRS Record Model Configuration @@ -717,7 +712,7 @@ setting in the zebra.cfg file. - +
The Abstract Syntax @@ -810,9 +805,9 @@ describe the given objects. - +
- +
The Configuration Files @@ -841,9 +836,9 @@ mandatory (m). - +
- +
The Abstract Syntax (.abs) Files @@ -954,7 +949,7 @@ - any tags + all tags (o) This directive specifies a list of attributes @@ -981,16 +976,16 @@ the attributes specifies which attributes to use when indexing the element in a comma-separated list. - A ! in place of the attribute name is equivalent to - specifying an attribute name identical to the element name. - A - in place of the attribute name + A ! in place of the attribute name is equivalent + to specifying an attribute name identical to the element name. + A - in place of the attribute name specifies that no indexing is to take place for the given element. The attributes can be qualified with field types to specify which character set should govern the indexing procedure for that field. The same data element may be indexed into several different fields, using different character set definitions. - See the . + See the . The default field type is w for word. @@ -1208,9 +1203,9 @@ - +
- +
The Attribute Set (.att) Files @@ -1294,9 +1289,9 @@ - +
- +
The Tag Set (.tag) Files @@ -1452,9 +1447,9 @@ - +
- +
The Variant Set (.var) Files @@ -1533,9 +1528,9 @@ - +
- +
The Element Set (.est) Files @@ -1673,9 +1668,9 @@ - +
- +
The Schema Mapping (.map) Files @@ -1737,9 +1732,9 @@ - +
- +
The MARC (ISO2709) Representation (.mar) Files @@ -1754,253 +1749,10 @@ handled by the system. --> - - - - Field Structure and Character Sets - - - - In order to provide a flexible approach to national character set - handling, Zebra allows the administrator to configure the set up the - system to handle any 8-bit character set — including sets that - require multi-octet diacritics or other multi-octet characters. The - definition of a character set includes a specification of the - permissible values, their sort order (this affects the display in the - SCAN function), and relationships between upper- and lowercase - characters. Finally, the definition includes the specification of - space characters for the set. - - - - The operator can define different character sets for different fields, - typical examples being standard text fields, numerical fields, and - special-purpose fields such as WWW-style linkages (URx). - - - - The default.idx file - - The field types, and hence character sets, are associated with data - elements by the .abs files (see above). - The file default.idx - provides the association between field type codes (as used in the .abs - files) and the character map files (with the .chr suffix). The format - of the .idx file is as follows - - - - - - - index field type code - - - This directive introduces a new search index code. - The argument is a one-character code to be used in the - .abs files to select this particular index type. An index, roughly, - corresponds to a particular structure attribute during search. Refer - to . - - - - sort field code type - - - This directive introduces a - sort index. The argument is a one-character code to be used in the - .abs fie to select this particular index type. The corresponding - use attribute must be used in the sort request to refer to this - particular sort index. The corresponding character map (see below) - is used in the sort process. 
- - - - completeness boolean - - - This directive enables or disables complete field indexing. - The value of the boolean should be 0 - (disable) or 1. If completeness is enabled, the index entry will - contain the complete contents of the field (up to a limit), with words - (non-space characters) separated by single space characters - (normalized to " " on display). When completeness is - disabled, each word is indexed as a separate entry. Complete subfield - indexing is most useful for fields which are typically browsed (eg. - titles, authors, or subjects), or instances where a match on a - complete subfield is essential (eg. exact title searching). For fields - where completeness is disabled, the search engine will interpret a - search containing space characters as a word proximity search. - - - - charmap filename - - - This is the filename of the character - map to be used for this index for field type. - - - - - - - - The character map file format - - The contents of the character map files are structured as follows: - +
+
- - - - - lowercase value-set - - - This directive introduces the basic value set of the field type. - The format is an ordered list (without spaces) of the - characters which may occur in "words" of the given type. - The order of the entries in the list determines the - sort order of the index. In addition to single characters, the - following combinations are legal: - - - - - - - - Backslashes may be used to introduce three-digit octal, or - two-digit hex representations of single characters - (preceded by x). - In addition, the combinations - \\, \\r, \\n, \\t, \\s (space — remember that real - space-characters may not occur in the value definition), and - \\ are recognized, with their usual interpretation. - - - - - - Curly braces {} may be used to enclose ranges of single - characters (possibly using the escape convention described in the - preceding point), eg. {a-z} to introduce the - standard range of ASCII characters. - Note that the interpretation of such a range depends on - the concrete representation in your local, physical character set. - - - - - - paranthesises () may be used to enclose multi-byte characters - - eg. diacritics or special national combinations (eg. Spanish - "ll"). When found in the input stream (or a search term), - these characters are viewed and sorted as a single character, with a - sorting value depending on the position of the group in the value - statement. - - - - - - - - - uppercase value-set - - - This directive introduces the - upper-case equivalencis to the value set (if any). The number and - order of the entries in the list should be the same as in the - lowercase directive. - - - - space value-set - - - This directive introduces the character - which separate words in the input stream. Depending on the - completeness mode of the field in question, these characters either - terminate an index entry, or delimit individual "words" in - the input stream. 
The order of the elements is not significant — - otherwise the representation is the same as for the - uppercase and lowercase - directives. - - - - map value-set - target - - - This directive introduces a mapping between each of the - members of the value-set on the left to the character on the - right. The character on the right must occur in the value - set (the lowercase directive) of the - character set, but it may be a paranthesis-enclosed - multi-octet character. This directive may be used to map - diacritics to their base characters, or to map HTML-style - character-representations to their natural form, etc. The - map directive can also be used to ignore leading articles in - searching and/or sorting, and to perform other special - transformations. See section . - - - - - - - Ignoring leading articles - - In addition to specifying sort orders, space (blank) handling, - and upper/lowercase folding, you can also use the character map - files to make Zebra ignore leading articles in sorting records, - or when doing complete field searching. - - - This is done using the map directive in the - character map file. In a nutshell, what you do is map certain - sequences of characters, when they occur in the - beginning of a field, to a space. Assuming that the - character "@" is defined as a space character in your file, you - can do: - - map (^The\s) @ - map (^the\s) @ - - The effect of these directives is to map either 'the' or 'The', - followed by a space character, to a space. The hat ^ character - denotes beginning-of-field only when complete-subfield indexing - or sort indexing is taking place; otherwise, it is treated just - as any other character. 
- - - Because the default.idx file can be used to - associate different character maps with different indexing types - -- and you can create additional indexing types, should the need - arise -- it is possible to specify that leading articles should - be ignored either in sorting, in complete-field searching, or - both. - - - If you ignore certain prefixes in sorting, then these will be - eliminated from the index, and sorting will take place as if - they weren't there. However, if you set the system up to ignore - certain prefixes in searching, then these - are deleted both from the indexes and from query terms, when the - client specifies complete-field searching. This has the effect - that a search for 'the science journal' and 'science journal' - would both produce the same results. - - - -
- - +
GRS Exchange Formats @@ -2085,8 +1837,326 @@ - +
+ +
+ Extended indexing of MARC records + + Extended indexing of MARC records will help you if you need to index a + combination of subfields, or index only a part of the whole field, + or use embedded fields of a MARC record during the indexing process. + + + Extended indexing of MARC records additionally allows: + + + + to index data in the LEADER of a MARC record + + + + to index data in control fields (with fixed length) + + + + to use the values of indicators during indexing + + + + to index linked fields for UNIMARC based formats + + + + + + Compared with the simple indexing process, extended indexing + may increase the indexing time for MARC records + (by about 2-3 times). + +
+ The index-formula + + First, we have to define the term + index-formula for MARC records. This term helps + in understanding the notation of extended indexing of MARC records by Zebra. + Our definition is based on the document + "The table + of conformity for Z39.50 use attributes and RUSMARC fields". + The document is available only in Russian. + + + The index-formula is the combination of + subfields presented in the following way: + + + + 71-00$a, $g, $h ($c){.$b ($c)} , (1) + + + + We know that Zebra supports a Bib-1 attribute - right truncation. + In this case, the index-formula (1) consists of + forms, defined in the same way as (1) + + + 71-00$a, $g, $h + 71-00$a, $g + 71-00$a + + + + The original MARC record may lack some of the elements included in the index-formula. + + + + This notation includes such operands as: + + + + # + It means a whitespace character. + + + + - + The position may contain any value defined by + the MARC format. + For example, index-formula + + + 70-#1$a, $g , (2) + + + includes + + + 700#1$a, $g + 701#1$a, $g + 702#1$a, $g + + + + + + + {...} + + The repeatable elements are defined in curly braces {}. + For example, + index-formula + + + 71-00$a, $g, $h ($c){.$b ($c)} , (3) + + + includes + + + 71-00$a, $g, $h ($c). $b ($c) + 71-00$a, $g, $h ($c). $b ($c). $b ($c) + 71-00$a, $g, $h ($c). $b ($c). $b ($c). $b ($c) + + + + + + + + + All other operands are the same as those accepted in the MARC world. + + + +
+ +
+ Notation of <emphasis>index-formula</emphasis> for Zebra + + + Extended indexing overloads the path of the + elm definition in the abstract syntax file of Zebra + (.abs file). It means that names beginning with + "mc-" are interpreted by Zebra as an + index-formula. The database index is created and + linked with an access point (Bib-1 use attribute) + according to this formula. + + For example, index-formula + + + 71-00$a, $g, $h ($c){.$b ($c)} , (4) + + + in the .abs file looks like: + + + mc-71.00_$a,_$g,_$h_(_$c_){.$b_(_$c_)} + + + + The notation of index-formula uses the following operands: + + + + _ + It means a whitespace character. + + + + . + The position may contain any value defined by + the MARC format. For example, + index-formula + + + 70-#1$a, $g , (5) + + + matches mc-70._1_$a,_$g_ and includes + + + 700_1_$a,_$g_ + 701_1_$a,_$g_ + 702_1_$a,_$g_ + + + + + + {...} + The repeatable elements are defined in + curly braces {}. For example, + index-formula + + + 71-00$a, $g, $h ($c){.$b ($c)} , (6) + + + matches + mc-71.00_$a,_$g,_$h_(_$c_){.$b_(_$c_)} and + includes + + + 71.00_$a,_$g,_$h_(_$c_).$b_(_$c_) + 71.00_$a,_$g,_$h_(_$c_).$b_(_$c_).$b_(_$c_) + 71.00_$a,_$g,_$h_(_$c_).$b_(_$c_).$b_(_$c_).$b_(_$c_) + + + + + + <...> + An embedded index-formula (for + linked fields) is placed between <>. For example, + index-formula + + + + 4--#-$170-#1$a, $g ($c) , (7) + + + matches + mc-4.._._$1<70._1_$a,_$g_(_$c_)>_ and + includes + + + 463_._$1<70._1_$a,_$g_(_$c_)>_ + + + + + + + + + All other operands are the same as those accepted in the MARC world. + + +
+ Examples + + + + + + + indexing LEADER + + You need to use the keyword "ldr" to index the leader. For example, + indexing data from the 6th and 7th positions of the LEADER + + + elm mc-ldr[6] Record-type ! + elm mc-ldr[7] Bib-level ! + + + + + + + indexing data from control fields + + indexing date (the time added to the database) + + + elm mc-008[0-5] Date/time-added-to-db ! + + + or for RUSMARC (this data is included in the 100th field) + + + elm mc-100___$a[0-7]_ Date/time-added-to-db ! + + + + + + + using indicators while indexing + + For RUSMARC index-formula + 70-#1$a, $g matches + + + elm mc-70._1_$a,_$g_ Author !:w,!:p + + + When Zebra finds a field matching the + "70." pattern, it checks the indicators. In this + case the value of the first indicator doesn't matter, but the value of + the second one must be whitespace; otherwise the field is not + indexed. + + + + + indexing embedded (linked) fields for UNIMARC based + formats + + For RUSMARC index-formula + 4--#-$170-#1$a, $g ($c) matches + + elm mc-4.._._$1<70._1_$a,_$g_(_$c_)>_ Author !:w,!:p + + + + Data are extracted from the record if the field matches the + "4.._." pattern and data in the linked field + match the embedded + index-formula + 70._1_$a,_$g_(_$c_). + + + + + + + +
+
+
+
+ + The Z39.50 Server - + Running the Z39.50 Server (zebrasrv) - Description + Description Zebra is a high-performance, general-purpose structured text indexing and retrieval engine. It reads structured records in a variety of input formats (eg. email, XML, MARC) and allows access to them through exact @@ -35,12 +35,12 @@ - + Synopsis &zebrasrv-synopsis; - + Options @@ -53,12 +53,12 @@ &zebrasrv-options; - Files + Files zebra.cfg - See Also + See Also zebraidx @@ -205,7 +205,7 @@ Z39.50 Protocol Support and Behavior - + Initialization @@ -244,7 +244,7 @@ - + Present The present facility is supported in a standard fashion. The requested @@ -254,7 +254,7 @@ provided by the relevant record profiles. - + Scan The attribute combinations provided with the termListAndStartPoint are @@ -263,7 +263,7 @@ the termInfo structure. - + Sort @@ -283,7 +283,7 @@ be the same as the output result set. - + Close If a Close PDU is received, the server will respond with a Close PDU @@ -298,7 +298,7 @@ - + Explain Zebra maintains a "classic" @@ -327,7 +327,7 @@ - + The SRU/SRW Server In addition to Z39.50, Zebra supports the more recent and @@ -370,7 +370,7 @@ the protocol packets and Zebra's support for them is equivalent. - + Running the SRU Server (zebrasrv) Because Zebra supports all three protocols on one port, it would @@ -467,7 +467,7 @@
- + SRU and SRW Protocol Support and Behavior Zebra running as an SRU server supports SRU version 1.1, including @@ -475,7 +475,7 @@ following elements of the protocol. - + Search and Retrieval Zebra fully supports SRU's core @@ -499,7 +499,7 @@ - + Scan Zebra supports SRU's @@ -518,7 +518,7 @@ - + Explain Zebra fully supports SRU's core @@ -552,7 +552,7 @@ - + Some SRU Examples Surf into http://localhost:9999 @@ -596,7 +596,7 @@ - + Initialization, Present, Sort, Close In the Z39.50 protocol, Initialization, Present, Sort and Close diff --git a/doc/zebra.xml b/doc/zebra.xml index 8c5dfe9..d68594c 100644 --- a/doc/zebra.xml +++ b/doc/zebra.xml @@ -9,7 +9,7 @@ %common; ]> - + Zebra - User's Guide and Reference @@ -69,6 +69,7 @@ &chap-administration; &chap-recordmodel-grs; &chap-recordmodel-alvisxslt; + &chap-field-structure; &chap-zebraidx; &chap-server; &app-license;