Turbo Marc Writer

author Dennis Schafroth <dennis@indexdata.com>

Wed, 18 Jan 2012 11:53:35 +0000 (12:53 +0100)

committer Dennis Schafroth <dennis@indexdata.com>

Thu, 16 Feb 2012 13:32:42 +0000 (14:32 +0100)
author Dennis Schafroth <dennis@indexdata.com>
Wed, 18 Jan 2012 11:53:35 +0000 (12:53 +0100)
committer Dennis Schafroth <dennis@indexdata.com>
Thu, 16 Feb 2012 13:32:42 +0000 (14:32 +0100)
diff --git a/src/org/marc4j/TurboMarcXmlWriter.java b/src/org/marc4j/TurboMarcXmlWriter.java

new file mode 100644 (file)

index 0000000..789410d
--- /dev/null
+++ b/src/org/marc4j/TurboMarcXmlWriter.java
@@ -0,0 +1,572 @@
+//$Id: MarcXmlWriter.java,v 1.9 2008/10/17 19:11:49 haschart Exp $\r
+/**\r
+ * Copyright (C) 2004 Bas Peters\r
+ *\r
+ * This file is part of MARC4J\r
+ *\r
+ * MARC4J is free software; you can redistribute it and/or\r
+ * modify it under the terms of the GNU Lesser General Public \r
+ * License as published by the Free Software Foundation; either \r
+ * version 2.1 of the License, or (at your option) any later version.\r
+ *\r
+ * MARC4J is distributed in the hope that it will be useful,\r
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\r
+ * Lesser General Public License for more details.\r
+ *\r
+ * You should have received a copy of the GNU Lesser General Public \r
+ * License along with MARC4J; if not, write to the Free Software\r
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA\r
+ */\r
+package org.marc4j;\r
+\r
+import java.io.BufferedWriter;\r
+import java.io.IOException;\r
+import java.io.OutputStream;\r
+import java.io.OutputStreamWriter;\r
+import java.io.UnsupportedEncodingException;\r
+import java.io.Writer;\r
+import java.util.Iterator;\r
+\r
+import javax.xml.transform.OutputKeys;\r
+import javax.xml.transform.Result;\r
+import javax.xml.transform.Source;\r
+import javax.xml.transform.TransformerFactory;\r
+import javax.xml.transform.sax.SAXTransformerFactory;\r
+import javax.xml.transform.sax.TransformerHandler;\r
+import javax.xml.transform.stream.StreamResult;\r
+import javax.xml.transform.stream.StreamSource;\r
+\r
+import org.marc4j.converter.CharConverter;\r
+import org.marc4j.marc.ControlField;\r
+import org.marc4j.marc.DataField;\r
+import org.marc4j.marc.Leader;\r
+import org.marc4j.marc.Record;\r
+import org.marc4j.marc.Subfield;\r
+import org.xml.sax.SAXException;\r
+import org.xml.sax.helpers.AttributesImpl;\r
+\r
+import com.ibm.icu.text.Normalizer;\r
+\r
+/**\r
+ * Class for writing MARC record objects in Turbo MARC XML format. This class outputs a\r
+ * SAX event stream to the given {@link java.io.OutputStream}&nbsp; or\r
+ * {@link javax.xml.transform.Result}&nbsp;object. It can be used in a SAX\r
+ * pipeline to postprocess the result. By default this class uses a null\r
+ * transform. It is strongly recommended to use a dedicated XML serializer.\r
+ * \r
+ * <p>\r
+ * This class requires a JAXP compliant XML parser and XSLT processor. The\r
+ * underlying SAX2 parser should be namespace aware. In addition this class\r
+ * requires <a href="http://icu.sourceforge.net/">ICU4J </a> to perform Unicode\r
+ * normalization. A stripped down version of 2.6 originating from the <a\r
+ * href="http://www.cafeconleche.org/XOM/">XOM </a> project is included in this\r
+ * distribution.\r
+ * </p>\r
+ * <p>\r
+ * The following example reads a file with MARC records and writes MARCXML\r
+ * records in UTF-8 encoding to the console:\r
+ * </p>\r
+ * \r
+ * <pre>\r
+ *  \r
+ *      InputStream input = new FileInputStream(&quot;input.mrc&quot;)\r
+ *      MarcReader reader = new MarcStreamReader(input);\r
+ *              \r
+ *      MarcWriter writer = new MarcXmlWriter(System.out, true);\r
+ *      while (reader.hasNext()) {\r
+ *          Record record = reader.next();\r
+ *          writer.write(record);\r
+ *      }\r
+ *      writer.close();\r
+ *   \r
+ * </pre>\r
+ * \r
+ * <p>\r
+ * To perform a character conversion like MARC-8 to UCS/Unicode register a\r
+ * <code>CharConverter</code>:\r
+ * </p>\r
+ * \r
+ * <pre>\r
+ * writer.setConverter(new AnselToUnicode());\r
+ * </pre>\r
+ * \r
+ * <p>\r
+ * In addition you can perform Unicode normalization. This is for example not\r
+ * done by the MARC-8 to UCS/Unicode converter. With Unicode normalization text\r
+ * is transformed into the canonical composed form. For example &quot;a&#769;bc&quot;\r
+ * is normalized to &quot;&aacute;bc&quot;. To perform normalization set Unicode\r
+ * normalization to true:\r
+ * </p>\r
+ * \r
+ * <pre>\r
+ * writer.setUnicodeNormalization(true);\r
+ * </pre>\r
+ * \r
+ * <p>\r
+ * Please note that it's not guaranteed to work if you try to convert normalized\r
+ * Unicode back to MARC-8 encoding using\r
+ * {@link org.marc4j.converter.impl.UnicodeToAnsel}.\r
+ * </p>\r
+ * <p>\r
+ * This class provides very basic formatting options. For more advanced options\r
+ * create an instance of this class with a\r
+ * {@link javax.xml.transform.sax.SAXResult}&nbsp;containing a\r
+ * {@link org.xml.sax.ContentHandler}&nbsp;derived from a dedicated XML\r
+ * serializer.\r
+ * </p>\r
+ * \r
+ * <p>\r
+ * The following example uses\r
+ * <code>org.apache.xml.serialize.XMLSerializer</code> to write MARC records\r
+ * to XML using MARC-8 to UCS/Unicode conversion and Unicode normalization:\r
+ * </p>\r
+ * \r
+ * <pre>\r
+ *  \r
+ *      InputStream input = new FileInputStream(&quot;input.mrc&quot;)\r
+ *      MarcReader reader = new MarcStreamReader(input);\r
+ *                \r
+ *      OutputFormat format = new OutputFormat(&quot;xml&quot;,&quot;UTF-8&quot;, true);\r
+ *      OutputStream out = new FileOutputStream(&quot;output.xml&quot;);\r
+ *      XMLSerializer serializer = new XMLSerializer(out, format);\r
+ *      Result result = new SAXResult(serializer.asContentHandler());\r
+ *                \r
+ *      MarcXmlWriter writer = new MarcXmlWriter(result);\r
+ *      writer.setConverter(new AnselToUnicode());\r
+ *      while (reader.hasNext()) {\r
+ *          Record record = reader.next();\r
+ *          writer.write(record);\r
+ *      }\r
+ *      writer.close();\r
+ *   \r
+ * </pre>\r
+ * \r
+ * <p>\r
+ * You can post-process the result using a <code>Source</code> object pointing\r
+ * to a stylesheet resource and a <code>Result</code> object to hold the\r
+ * transformation result tree. The example below converts MARC to MARCXML and\r
+ * transforms the result tree to MODS using the stylesheet provided by The\r
+ * Library of Congress:\r
+ * </p>\r
+ * \r
+ * <pre>\r
+ *  \r
+ *      String stylesheetUrl = &quot;http://www.loc.gov/standards/mods/v3/MARC21slim2MODS3.xsl&quot;;\r
+ *      Source stylesheet = new StreamSource(stylesheetUrl);\r
+ *         \r
+ *      Result result = new StreamResult(System.out);\r
+ *            \r
+ *      InputStream input = new FileInputStream(&quot;input.mrc&quot;)\r
+ *      MarcReader reader = new MarcStreamReader(input);\r
+ *      MarcXmlWriter writer = new MarcXmlWriter(result, stylesheet);\r
+ *      writer.setConverter(new AnselToUnicode());\r
+ *      while (reader.hasNext()) {\r
+ *          Record record = (Record) reader.next();\r
+ *          writer.write(record);\r
+ *      }\r
+ *      writer.close();\r
+ *   \r
+ * </pre>\r
+ * \r
+ * <p>\r
+ * It is also possible to write the result into a DOM Node:\r
+ * </p>\r
+ * \r
+ * <pre>\r
+ *  \r
+ *      InputStream input = new FileInputStream(&quot;input.mrc&quot;)\r
+ *      MarcReader reader = new MarcStreamReader(input);\r
+ *      DOMResult result = new DOMResult();\r
+ *      MarcXmlWriter writer = new MarcXmlWriter(result);\r
+ *      writer.setConverter(new AnselToUnicode());\r
+ *      while (reader.hasNext()) {\r
+ *          Record record = (Record) reader.next();\r
+ *          writer.write(record);\r
+ *      }\r
+ *      writer.close();\r
+ *         \r
+ *      Document doc = (Document) result.getNode();\r
+ *   \r
+ * </pre>\r
+ * \r
+ * @author Bas Peters\r
+ * @version $Revision: 1.9 $\r
+ * \r
+ */\r
+public class TurboMarcXmlWriter implements MarcWriter {\r
+\r
+    protected static final String CONTROL_FIELD = "c";\r
+\r
+    protected static final String DATA_FIELD = "d";\r
+\r
+    protected static final String SUBFIELD = "s";\r
+\r
+    protected static final String COLLECTION = "c";\r
+\r
+    protected static final String RECORD = "r";\r
+\r
+    protected static final String LEADER = "l";\r
+\r
+    private boolean indent = false;\r
+\r
+    private TransformerHandler handler = null;\r
+\r
+    private Writer writer = null;\r
+    \r
+    \r
+    /**\r
+     * Character encoding. Default is UTF-8.\r
+     */\r
+    //private String encoding = "UTF8";\r
+\r
+    private CharConverter converter = null;\r
+\r
+    private boolean normalize = false;\r
+\r
+    /**\r
+     * Constructs an instance that writes to the specified output stream.\r
+     * \r
+     * Output is encoded as UTF-8 and indentation is disabled.\r
+     * @param out the stream to write to; must not be null\r
+     * @throws MarcException if the writer cannot be initialized\r
+     */\r
+    public TurboMarcXmlWriter(OutputStream out) {\r
+        this(out, false);\r
+    }\r
+\r
+    /**\r
+     * Constructs a UTF-8 writer on the specified output stream.\r
+     * \r
+     * @param out the stream to write to; must not be null\r
+     * @param indent true to write line breaks and indentation between elements\r
+     * @throws MarcException if the writer cannot be initialized\r
+     */\r
+    public TurboMarcXmlWriter(OutputStream out, boolean indent) {\r
+        this(out, "UTF8", indent);\r
+    }\r
+\r
+    /**\r
+     * Constructs a non-indenting writer on the specified output stream.\r
+     * @param out the stream to write to; must not be null\r
+     * @param encoding the name of the character encoding; must not be null\r
+     * @throws MarcException if the encoding is unsupported or setup fails\r
+     */\r
+    public TurboMarcXmlWriter(OutputStream out, String encoding) {\r
+        this(out, encoding, false);\r
+    }\r
+\r
+    /**\r
+     * Constructs an instance with the specified output stream, character\r
+     * encoding and indentation, and writes the root start tag.\r
+     * \r
+     * @throws MarcException if the encoding is unsupported or setup fails\r
+     */\r
+    public TurboMarcXmlWriter(OutputStream out, String encoding, boolean indent) {\r
+        if (out == null)\r
+            throw new NullPointerException("null OutputStream");\r
+        if (encoding == null)\r
+            throw new NullPointerException("null encoding");\r
+        setIndent(indent);\r
+        try {\r
+            // Wrap the encoding writer in a buffer.\r
+            Writer encoded = new OutputStreamWriter(out, encoding);\r
+            writer = new BufferedWriter(encoded);\r
+        } catch (UnsupportedEncodingException e) {\r
+            throw new MarcException(e.getMessage(), e);\r
+        }\r
+        // A null stylesheet selects the plain (no-argument) transformer handler.\r
+        Result target = new StreamResult(writer);\r
+        setHandler(target, null);\r
+        writeStartDocument();\r
+    }\r
+\r
+    /**\r
+     * Constructs an instance that writes to the given result, using no stylesheet.\r
+     * \r
+     * @param result the target of the SAX event stream; must not be null\r
+     * @throws MarcException if the handler cannot be set up or the root start tag fails\r
+     */\r
+    public TurboMarcXmlWriter(Result result) {\r
+        if (result == null)\r
+            throw new NullPointerException("null Result");\r
+        setHandler(result, null);\r
+        writeStartDocument();\r
+    }\r
+\r
+    /**\r
+     * Constructs an instance with the specified result and stylesheet location.\r
+     * @param result the target of the transformation; must not be null\r
+     * @param stylesheetUrl location of the stylesheet, wrapped in a StreamSource\r
+     * @throws MarcException if the handler cannot be set up\r
+     */\r
+    public TurboMarcXmlWriter(Result result, String stylesheetUrl) {\r
+        this(result, new StreamSource(stylesheetUrl));\r
+    }\r
+\r
+    /**\r
+     * Constructs an instance with the specified result and stylesheet source.\r
+     * @param result the target of the transformation; must not be null\r
+     * @param stylesheet the stylesheet to apply; must not be null\r
+     * @throws MarcException if the handler cannot be set up or the root start tag fails\r
+     */\r
+    public TurboMarcXmlWriter(Result result, Source stylesheet) {\r
+        if (stylesheet == null)\r
+            throw new NullPointerException("null Source");\r
+        if (result == null)\r
+            throw new NullPointerException("null Result");\r
+        setHandler(result, stylesheet);\r
+        writeStartDocument();\r
+    }\r
+\r
+    /**\r
+     * Writes the root end tag, ends the document and closes the underlying\r
+     * writer if this instance created one.\r
+     * \r
+     * @throws MarcException\r
+     *             if ending the document or closing the writer fails\r
+     */\r
+    public void close() {\r
+        writeEndDocument();\r
+        try {\r
+            // Instances constructed from a Result never create a Writer;\r
+            // there is nothing to close in that case.\r
+            if (writer != null) {\r
+                writer.close();\r
+            }\r
+        } catch (IOException e) {\r
+            throw new MarcException(e.getMessage(), e);\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Returns the character converter.\r
+     * \r
+     * @return the character converter, or null if none has been set\r
+     */\r
+    public CharConverter getConverter() {\r
+        return converter;\r
+    }\r
+\r
+    /**\r
+     * Sets the character converter applied to control field and subfield data.\r
+     * \r
+     * @param converter\r
+     *            the character converter, or null for no conversion\r
+     */\r
+    public void setConverter(CharConverter converter) {\r
+        this.converter = converter;\r
+    }\r
+\r
+    /**\r
+     * If set to true this writer will perform Unicode normalization on data\r
+     * elements using normalization form C (NFC). The default is false.\r
+     * \r
+     * Data elements are control field and subfield data; the leader is\r
+     * written unchanged.\r
+     * \r
+     * @param normalize\r
+     *            true if this writer performs Unicode normalization, false\r
+     *            otherwise\r
+     */\r
+    public void setUnicodeNormalization(boolean normalize) {\r
+        this.normalize = normalize;\r
+    }\r
+\r
+    /**\r
+     * Returns whether Unicode normalization has been requested for this\r
+     * writer. The default is false.\r
+     * \r
+     * @return true if Unicode normalization has been enabled, false\r
+     *         otherwise\r
+     */\r
+    public boolean getUnicodeNormalization() {\r
+        return normalize;\r
+    }\r
+\r
+    /**\r
+     * Creates the SAX transformer handler that receives this writer's events\r
+     * and binds it to the given result.\r
+     * \r
+     * @param result\r
+     *            the target of the handler's output\r
+     * @param stylesheet\r
+     *            the stylesheet to apply, or null for no stylesheet\r
+     * @throws MarcException\r
+     *             if the handler cannot be created or configured\r
+     */\r
+    protected void setHandler(Result result, Source stylesheet)\r
+            throws MarcException {\r
+        try {\r
+            TransformerFactory tf = TransformerFactory.newInstance();\r
+            boolean saxCapable = tf.getFeature(SAXTransformerFactory.FEATURE);\r
+            if (!saxCapable) {\r
+                throw new UnsupportedOperationException(\r
+                        "SAXTransformerFactory is not supported");\r
+            }\r
+\r
+            SAXTransformerFactory sax = (SAXTransformerFactory) tf;\r
+            handler = (stylesheet == null)\r
+                    ? sax.newTransformerHandler()\r
+                    : sax.newTransformerHandler(stylesheet);\r
+            handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");\r
+            handler.setResult(result);\r
+        } catch (Exception e) {\r
+            // Also wraps the UnsupportedOperationException thrown above.\r
+            throw new MarcException(e.getMessage(), e);\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Starts the document and writes the root start tag to the result.\r
+     * \r
+     * @throws MarcException if the handler reports a SAX error\r
+     */\r
+    protected void writeStartDocument() {\r
+        try {\r
+            AttributesImpl atts = new AttributesImpl();\r
+            handler.startDocument();\r
+            // The next line duplicates the namespace declaration for Marc XML\r
+            // handler.startPrefixMapping("", Constants.MARCXML_NS_URI);\r
+            // add namespace declaration using attribute - need better solution\r
+            atts.addAttribute(Constants.TURBO_MARCXML_NS_URI, "xmlns", "xmlns",\r
+                              "CDATA", Constants.TURBO_MARCXML_NS_URI);            \r
+            handler.startElement(Constants.TURBO_MARCXML_NS_URI, COLLECTION, COLLECTION, atts);\r
+        } catch (SAXException e) {\r
+            throw new MarcException(\r
+                    "SAX error occured while writing start document", e);\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Writes the root end tag to the result and ends the document.\r
+     * \r
+     * @throws MarcException if the handler reports a SAX error\r
+     */\r
+    protected void writeEndDocument() {\r
+        try {\r
+            if (indent)\r
+                handler.ignorableWhitespace("\n".toCharArray(), 0, 1);\r
+\r
+            handler\r
+                    .endElement(Constants.TURBO_MARCXML_NS_URI, COLLECTION,\r
+                            COLLECTION);\r
+            handler.endPrefixMapping("");\r
+            handler.endDocument();\r
+        } catch (SAXException e) {\r
+            throw new MarcException(\r
+                    "SAX error occured while writing end document", e);\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Writes a Record object to the result.\r
+     * \r
+     * @param record -\r
+     *            the <code>Record</code> object\r
+     * @throws MarcException if the handler reports a SAX error\r
+     */\r
+    public void write(Record record) {\r
+        try {\r
+            toXml(record);\r
+        } catch (SAXException e) {\r
+            throw new MarcException("SAX error occured while writing record", e);\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Returns true if indentation is active, false otherwise.\r
+     * \r
+     * @return true if line breaks and indentation are written between elements\r
+     */\r
+    public boolean hasIndent() {\r
+        return indent;\r
+    }\r
+\r
+    /**\r
+     * Activates or deactivates indentation. Default value is false.\r
+     * \r
+     * @param indent true to write line breaks and indentation between elements\r
+     */\r
+    public void setIndent(boolean indent) {\r
+        this.indent = indent;\r
+    }\r
+\r
+    /**\r
+     * Writes a single record to the handler as SAX events.\r
+     * <p>\r
+     * The record element contains the leader element, then one element per\r
+     * control field (named "c" plus the tag) and one per data field (named "d"\r
+     * plus the tag, with ind1 and ind2 attributes). A subfield element is named\r
+     * "s" plus its code when the code is an ASCII letter or digit; any other\r
+     * code yields a plain "s" element with the code in a "code" attribute.\r
+     * </p>\r
+     * \r
+     * @param record\r
+     *            the record to write\r
+     * @throws SAXException\r
+     *             if the handler fails to process an event\r
+     */\r
+    protected void toXml(Record record) throws SAXException {\r
+        char temp[];\r
+        // Shared by every element that carries no attributes.\r
+        AttributesImpl noAtts = new AttributesImpl();\r
+        if (indent)\r
+            handler.ignorableWhitespace("\n  ".toCharArray(), 0, 3);\r
+\r
+        handler.startElement(Constants.TURBO_MARCXML_NS_URI, RECORD, RECORD, noAtts);\r
+\r
+        if (indent)\r
+            handler.ignorableWhitespace("\n    ".toCharArray(), 0, 5);\r
+\r
+        handler.startElement(Constants.TURBO_MARCXML_NS_URI, LEADER, LEADER, noAtts);\r
+        Leader leader = record.getLeader();\r
+        temp = leader.toString().toCharArray();\r
+        handler.characters(temp, 0, temp.length);\r
+        handler.endElement(Constants.TURBO_MARCXML_NS_URI, LEADER, LEADER);\r
+\r
+        Iterator<ControlField> ci = record.getControlFields().iterator();\r
+        while (ci.hasNext()) {\r
+            ControlField field = (ControlField) ci.next();\r
+\r
+            if (indent)\r
+                handler.ignorableWhitespace("\n    ".toCharArray(), 0, 5);\r
+            String elementName = CONTROL_FIELD + field.getTag();\r
+            handler.startElement(Constants.TURBO_MARCXML_NS_URI, elementName, elementName, noAtts);\r
+            temp = getDataElement(field.getData());\r
+            handler.characters(temp, 0, temp.length);\r
+            handler.endElement(Constants.TURBO_MARCXML_NS_URI, elementName, elementName);\r
+        }\r
+\r
+        Iterator<DataField> di = record.getDataFields().iterator();\r
+        while (di.hasNext()) {\r
+            DataField field = di.next();\r
+            AttributesImpl fieldAtts = new AttributesImpl();\r
+            fieldAtts.addAttribute("", "ind1", "ind1", "CDATA", String.valueOf(field\r
+                    .getIndicator1()));\r
+            fieldAtts.addAttribute("", "ind2", "ind2", "CDATA", String.valueOf(field\r
+                    .getIndicator2()));\r
+\r
+            if (indent)\r
+                handler.ignorableWhitespace("\n    ".toCharArray(), 0, 5);\r
+            String elementName = DATA_FIELD + field.getTag();\r
+            handler.startElement(Constants.TURBO_MARCXML_NS_URI, elementName, elementName, fieldAtts);\r
+\r
+            Iterator<Subfield> si = field.getSubfields().iterator();\r
+            while (si.hasNext()) {\r
+                Subfield subfield = (Subfield) si.next();\r
+                String subfieldName = SUBFIELD;\r
+                // Fresh attributes for every subfield: reusing the data field's\r
+                // object would put ind1/ind2 on the subfield element, and a\r
+                // "code" attribute would carry over to the following subfields.\r
+                AttributesImpl subfieldAtts = new AttributesImpl();\r
+\r
+                char code = subfield.getCode();\r
+                // if [a-zA-Z0-9] append to the element name, otherwise use an attribute\r
+                if (code >= '0' && code <= '9' ||\r
+                    code >= 'a' && code <= 'z' ||\r
+                    code >= 'A' && code <= 'Z') {\r
+                    subfieldName += code;\r
+                }\r
+                else {\r
+                    subfieldAtts.addAttribute("", "code", "code", "CDATA", String\r
+                        .valueOf(code));\r
+                }\r
+                if (indent)\r
+                    handler.ignorableWhitespace("\n      ".toCharArray(), 0, 7);\r
+\r
+                handler.startElement(Constants.TURBO_MARCXML_NS_URI, subfieldName,\r
+                    subfieldName, subfieldAtts);\r
+                temp = getDataElement(subfield.getData());\r
+                handler.characters(temp, 0, temp.length);\r
+                handler.endElement(Constants.TURBO_MARCXML_NS_URI, subfieldName,\r
+                    subfieldName);\r
+            }\r
+\r
+            if (indent)\r
+                handler.ignorableWhitespace("\n    ".toCharArray(), 0, 5);\r
+\r
+            handler.endElement(Constants.TURBO_MARCXML_NS_URI, elementName, elementName);\r
+        }\r
+\r
+        if (indent)\r
+            handler.ignorableWhitespace("\n  ".toCharArray(), 0, 3);\r
+\r
+        handler.endElement(Constants.TURBO_MARCXML_NS_URI, RECORD, RECORD);\r
+    }\r
+\r
+    /**\r
+     * Prepares control field or subfield data for output: applies the\r
+     * character converter if one is set, then Unicode normalization (NFC) if\r
+     * it is enabled.\r
+     * \r
+     * @param data\r
+     *            the raw data element\r
+     * @return the characters to write\r
+     */\r
+    protected char[] getDataElement(String data) {\r
+        String dataElement = data;\r
+        if (converter != null)\r
+            dataElement = converter.convert(data);\r
+        // Normalization is independent of conversion: previously it was\r
+        // skipped whenever no converter had been registered.\r
+        if (normalize)\r
+            dataElement = Normalizer.normalize(dataElement, Normalizer.NFC);\r
+        return dataElement.toCharArray();\r
+    }\r
+}
\ No newline at end of file
author	Dennis Schafroth <dennis@indexdata.com>
	Wed, 18 Jan 2012 11:53:35 +0000 (12:53 +0100)
committer	Dennis Schafroth <dennis@indexdata.com>
	Thu, 16 Feb 2012 13:32:42 +0000 (14:32 +0100)