--- /dev/null
+//$Id: MarcXmlWriter.java,v 1.9 2008/10/17 19:11:49 haschart Exp $\r
+/**\r
+ * Copyright (C) 2004 Bas Peters\r
+ *\r
+ * This file is part of MARC4J\r
+ *\r
+ * MARC4J is free software; you can redistribute it and/or\r
+ * modify it under the terms of the GNU Lesser General Public \r
+ * License as published by the Free Software Foundation; either \r
+ * version 2.1 of the License, or (at your option) any later version.\r
+ *\r
+ * MARC4J is distributed in the hope that it will be useful,\r
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\r
+ * Lesser General Public License for more details.\r
+ *\r
+ * You should have received a copy of the GNU Lesser General Public \r
+ * License along with MARC4J; if not, write to the Free Software\r
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA\r
+ */\r
+package org.marc4j;\r
+\r
+import java.io.BufferedWriter;\r
+import java.io.IOException;\r
+import java.io.OutputStream;\r
+import java.io.OutputStreamWriter;\r
+import java.io.UnsupportedEncodingException;\r
+import java.io.Writer;\r
+import java.util.Iterator;\r
+\r
+import javax.xml.transform.OutputKeys;\r
+import javax.xml.transform.Result;\r
+import javax.xml.transform.Source;\r
+import javax.xml.transform.TransformerFactory;\r
+import javax.xml.transform.sax.SAXTransformerFactory;\r
+import javax.xml.transform.sax.TransformerHandler;\r
+import javax.xml.transform.stream.StreamResult;\r
+import javax.xml.transform.stream.StreamSource;\r
+\r
+import org.marc4j.converter.CharConverter;\r
+import org.marc4j.marc.ControlField;\r
+import org.marc4j.marc.DataField;\r
+import org.marc4j.marc.Leader;\r
+import org.marc4j.marc.Record;\r
+import org.marc4j.marc.Subfield;\r
+import org.xml.sax.SAXException;\r
+import org.xml.sax.helpers.AttributesImpl;\r
+\r
+import com.ibm.icu.text.Normalizer;\r
+\r
+/**\r
+ * Class for writing MARC record objects in Turbo MARCXML format, a compact
+ * MARCXML variant that folds field tags and subfield codes into element
+ * names. This class outputs a
+ * SAX event stream to the given {@link java.io.OutputStream} or\r
+ * {@link javax.xml.transform.Result} object. It can be used in a SAX\r
+ * pipeline to postprocess the result. By default this class uses a null
+ * transform. It is strongly recommended to use a dedicated XML serializer.\r
+ * \r
+ * <p>\r
+ * This class requires a JAXP compliant XML parser and XSLT processor. The\r
+ * underlying SAX2 parser should be namespace aware. In addition this class\r
+ * requires <a href="http://icu.sourceforge.net/">ICU4J </a> to perform Unicode\r
+ * normalization. A stripped down version of 2.6 originating from the <a\r
+ * href="http://www.cafeconleche.org/XOM/">XOM </a> project is included in this\r
+ * distribution.\r
+ * </p>\r
+ * <p>\r
+ * The following example reads a file with MARC records and writes MARCXML\r
+ * records in UTF-8 encoding to the console:\r
+ * </p>\r
+ * \r
+ * <pre>\r
+ * \r
+ * InputStream input = new FileInputStream("input.mrc")\r
+ * MarcReader reader = new MarcStreamReader(input);\r
+ * \r
+ * MarcWriter writer = new MarcXmlWriter(System.out, true);\r
+ * while (reader.hasNext()) {\r
+ * Record record = reader.next();\r
+ * writer.write(record);\r
+ * }\r
+ * writer.close();\r
+ * \r
+ * </pre>\r
+ * \r
+ * <p>\r
+ * To perform a character conversion like MARC-8 to UCS/Unicode register a\r
+ * <code>CharConverter</code>:\r
+ * </p>\r
+ * \r
+ * <pre>\r
+ * writer.setConverter(new AnselToUnicode());\r
+ * </pre>\r
+ * \r
+ * <p>\r
+ * In addition you can perform Unicode normalization. This is for example not\r
+ * done by the MARC-8 to UCS/Unicode converter. With Unicode normalization text\r
+ * is transformed into the canonical composed form. For example "a&#x301;bc"
+ * (an "a" followed by a combining acute accent) is normalized to
+ * "&#xE1;bc". To perform normalization set Unicode
+ * normalization to true:\r
+ * </p>\r
+ * \r
+ * <pre>\r
+ * writer.setUnicodeNormalization(true);\r
+ * </pre>\r
+ * \r
+ * <p>\r
+ * Please note that it's not guaranteed to work if you try to convert normalized
+ * Unicode back to MARC-8 encoding using\r
+ * {@link org.marc4j.converter.impl.UnicodeToAnsel}.\r
+ * </p>\r
+ * <p>\r
+ * This class provides very basic formatting options. For more advanced options\r
+ * create an instance of this class with a\r
+ * {@link javax.xml.transform.sax.SAXResult} containing a\r
+ * {@link org.xml.sax.ContentHandler} derived from a dedicated XML\r
+ * serializer.\r
+ * </p>\r
+ * \r
+ * <p>\r
+ * The following example uses\r
+ * <code>org.apache.xml.serialize.XMLSerializer</code> to write MARC records\r
+ * to XML using MARC-8 to UCS/Unicode conversion and Unicode normalization:\r
+ * </p>\r
+ * \r
+ * <pre>\r
+ * \r
+ * InputStream input = new FileInputStream("input.mrc")\r
+ * MarcReader reader = new MarcStreamReader(input);\r
+ * \r
+ * OutputFormat format = new OutputFormat("xml","UTF-8", true);\r
+ * OutputStream out = new FileOutputStream("output.xml");\r
+ * XMLSerializer serializer = new XMLSerializer(out, format);\r
+ * Result result = new SAXResult(serializer.asContentHandler());\r
+ * \r
+ * MarcXmlWriter writer = new MarcXmlWriter(result);\r
+ * writer.setConverter(new AnselToUnicode());\r
+ * while (reader.hasNext()) {\r
+ * Record record = reader.next();\r
+ * writer.write(record);\r
+ * }\r
+ * writer.close();\r
+ * \r
+ * </pre>\r
+ * \r
+ * <p>\r
+ * You can post-process the result using a <code>Source</code> object pointing\r
+ * to a stylesheet resource and a <code>Result</code> object to hold the\r
+ * transformation result tree. The example below converts MARC to MARCXML and\r
+ * transforms the result tree to MODS using the stylesheet provided by The\r
+ * Library of Congress:\r
+ * </p>\r
+ * \r
+ * <pre>\r
+ * \r
+ * String stylesheetUrl = "http://www.loc.gov/standards/mods/v3/MARC21slim2MODS3.xsl";\r
+ * Source stylesheet = new StreamSource(stylesheetUrl);\r
+ * \r
+ * Result result = new StreamResult(System.out);\r
+ * \r
+ * InputStream input = new FileInputStream("input.mrc")\r
+ * MarcReader reader = new MarcStreamReader(input);\r
+ * MarcXmlWriter writer = new MarcXmlWriter(result, stylesheet);\r
+ * writer.setConverter(new AnselToUnicode());\r
+ * while (reader.hasNext()) {\r
+ * Record record = (Record) reader.next();\r
+ * writer.write(record);\r
+ * }\r
+ * writer.close();\r
+ * \r
+ * </pre>\r
+ * \r
+ * <p>\r
+ * It is also possible to write the result into a DOM Node:\r
+ * </p>\r
+ * \r
+ * <pre>\r
+ * \r
+ * InputStream input = new FileInputStream("input.mrc")\r
+ * MarcReader reader = new MarcStreamReader(input);\r
+ * DOMResult result = new DOMResult();\r
+ * MarcXmlWriter writer = new MarcXmlWriter(result);\r
+ * writer.setConverter(new AnselToUnicode());\r
+ * while (reader.hasNext()) {\r
+ * Record record = (Record) reader.next();\r
+ * writer.write(record);\r
+ * }\r
+ * writer.close();\r
+ * \r
+ * Document doc = (Document) result.getNode();\r
+ * \r
+ * </pre>\r
+ * \r
+ * @author Bas Peters\r
+ * @version $Revision: 1.9 $\r
+ * \r
+ */\r
+public class TurboMarcXmlWriter implements MarcWriter {\r
+\r
+ protected static final String CONTROL_FIELD = "c";\r
+\r
+ protected static final String DATA_FIELD = "d";\r
+\r
+ protected static final String SUBFIELD = "s";\r
+\r
+ protected static final String COLLECTION = "c";\r
+\r
+ protected static final String RECORD = "r";\r
+\r
+ protected static final String LEADER = "l";\r
+\r
+ private boolean indent = false;\r
+\r
+ private TransformerHandler handler = null;\r
+\r
+ private Writer writer = null;\r
+ \r
+ \r
+ /**\r
+ * Character encoding. Default is UTF-8.\r
+ */\r
+ //private String encoding = "UTF8";\r
+\r
+ private CharConverter converter = null;\r
+\r
+ private boolean normalize = false;\r
+\r
+ /**\r
+ * Constructs an instance with the specified output stream.\r
+ * \r
+ * The default character encoding for UTF-8 is used.\r
+ * \r
+ * @throws MarcException\r
+ */\r
+ public TurboMarcXmlWriter(OutputStream out) {\r
+ this(out, false);\r
+ }\r
+\r
+ /**\r
+ * Constructs an instance with the specified output stream and indentation.\r
+ * \r
+ * The default character encoding for UTF-8 is used.\r
+ * \r
+ * @throws MarcException\r
+ */\r
+ public TurboMarcXmlWriter(OutputStream out, boolean indent) {\r
+ this(out, "UTF8", indent);\r
+ }\r
+\r
+ /**\r
+ * Constructs an instance with the specified output stream and character\r
+ * encoding.\r
+ * \r
+ * @throws MarcException\r
+ */\r
+ public TurboMarcXmlWriter(OutputStream out, String encoding) {\r
+ this(out, encoding, false);\r
+ }\r
+\r
+ /**\r
+ * Constructs an instance with the specified output stream, character\r
+ * encoding and indentation.\r
+ * \r
+ * @throws MarcException\r
+ */\r
+ public TurboMarcXmlWriter(OutputStream out, String encoding, boolean indent) {\r
+ if (out == null) {\r
+ throw new NullPointerException("null OutputStream");\r
+ }\r
+ if (encoding == null) {\r
+ throw new NullPointerException("null encoding");\r
+ }\r
+ try {\r
+ setIndent(indent);\r
+ writer = new OutputStreamWriter(out, encoding);\r
+ writer = new BufferedWriter(writer);\r
+ // this.encoding = encoding;\r
+ setHandler(new StreamResult(writer), null);\r
+ } catch (UnsupportedEncodingException e) {\r
+ throw new MarcException(e.getMessage(), e);\r
+ }\r
+ writeStartDocument();\r
+ }\r
+\r
+ /**\r
+ * Constructs an instance with the specified result.\r
+ * \r
+ * @param result\r
+ * @throws SAXException\r
+ */\r
+ public TurboMarcXmlWriter(Result result) {\r
+ if (result == null)\r
+ throw new NullPointerException("null Result");\r
+ setHandler(result, null);\r
+ writeStartDocument();\r
+ }\r
+\r
+ /**\r
+ * Constructs an instance with the specified stylesheet location and result.\r
+ * \r
+ * @param result\r
+ * @throws SAXException\r
+ */\r
+ public TurboMarcXmlWriter(Result result, String stylesheetUrl) {\r
+ this(result, new StreamSource(stylesheetUrl));\r
+ }\r
+\r
+ /**\r
+ * Constructs an instance with the specified stylesheet source and result.\r
+ * \r
+ * @param result\r
+ * @throws SAXException\r
+ */\r
+ public TurboMarcXmlWriter(Result result, Source stylesheet) {\r
+ if (stylesheet == null)\r
+ throw new NullPointerException("null Source");\r
+ if (result == null)\r
+ throw new NullPointerException("null Result");\r
+ setHandler(result, stylesheet);\r
+ writeStartDocument();\r
+ }\r
+\r
+ public void close() {\r
+ writeEndDocument();\r
+ try {\r
+ writer.close();\r
+ } catch (IOException e) {\r
+ throw new MarcException(e.getMessage(), e);\r
+ }\r
+ }\r
+\r
+ /**\r
+ * Returns the character converter.\r
+ * \r
+ * @return CharConverter the character converter\r
+ */\r
+ public CharConverter getConverter() {\r
+ return converter;\r
+ }\r
+\r
+ /**\r
+ * Sets the character converter.\r
+ * \r
+ * @param converter\r
+ * the character converter\r
+ */\r
+ public void setConverter(CharConverter converter) {\r
+ this.converter = converter;\r
+ }\r
+\r
+ /**\r
+ * If set to true this writer will perform Unicode normalization on data\r
+ * elements using normalization form C (NFC). The default is false.\r
+ * \r
+ * The implementation used is ICU4J 2.6. This version is based on Unicode\r
+ * 4.0.\r
+ * \r
+ * @param normalize\r
+ * true if this writer performs Unicode normalization, false\r
+ * otherwise\r
+ */\r
+ public void setUnicodeNormalization(boolean normalize) {\r
+ this.normalize = normalize;\r
+ }\r
+\r
+ /**\r
+ * Returns true if this writer will perform Unicode normalization, false\r
+ * otherwise.\r
+ * \r
+ * @return boolean - true if this writer performs Unicode normalization,\r
+ * false otherwise.\r
+ */\r
+ public boolean getUnicodeNormalization() {\r
+ return normalize;\r
+ }\r
+\r
+ protected void setHandler(Result result, Source stylesheet)\r
+ throws MarcException {\r
+ try {\r
+ TransformerFactory factory = TransformerFactory.newInstance();\r
+ if (!factory.getFeature(SAXTransformerFactory.FEATURE))\r
+ throw new UnsupportedOperationException(\r
+ "SAXTransformerFactory is not supported");\r
+\r
+ SAXTransformerFactory saxFactory = (SAXTransformerFactory) factory;\r
+ if (stylesheet == null)\r
+ handler = saxFactory.newTransformerHandler();\r
+ else\r
+ handler = saxFactory.newTransformerHandler(stylesheet);\r
+ handler.getTransformer()\r
+ .setOutputProperty(OutputKeys.METHOD, "xml");\r
+ handler.setResult(result);\r
+\r
+ } catch (Exception e) {\r
+ throw new MarcException(e.getMessage(), e);\r
+ }\r
+ }\r
+\r
+ /**\r
+ * Writes the root start tag to the result.\r
+ * \r
+ * @throws SAXException\r
+ */\r
+ protected void writeStartDocument() {\r
+ try {\r
+ AttributesImpl atts = new AttributesImpl();\r
+ handler.startDocument();\r
+ // The next line duplicates the namespace declaration for Marc XML\r
+ // handler.startPrefixMapping("", Constants.MARCXML_NS_URI);\r
+ // add namespace declaration using attribute - need better solution\r
+ atts.addAttribute(Constants.TURBO_MARCXML_NS_URI, "xmlns", "xmlns",\r
+ "CDATA", Constants.TURBO_MARCXML_NS_URI); \r
+ handler.startElement(Constants.TURBO_MARCXML_NS_URI, COLLECTION, COLLECTION, atts);\r
+ } catch (SAXException e) {\r
+ throw new MarcException(\r
+ "SAX error occured while writing start document", e);\r
+ }\r
+ }\r
+\r
+ /**\r
+ * Writes the root end tag to the result.\r
+ * \r
+ * @throws SAXException\r
+ */\r
+ protected void writeEndDocument() {\r
+ try {\r
+ if (indent)\r
+ handler.ignorableWhitespace("\n".toCharArray(), 0, 1);\r
+\r
+ handler\r
+ .endElement(Constants.TURBO_MARCXML_NS_URI, COLLECTION,\r
+ COLLECTION);\r
+ handler.endPrefixMapping("");\r
+ handler.endDocument();\r
+ } catch (SAXException e) {\r
+ throw new MarcException(\r
+ "SAX error occured while writing end document", e);\r
+ }\r
+ }\r
+\r
+ /**\r
+ * Writes a Record object to the result.\r
+ * \r
+ * @param record -\r
+ * the <code>Record</code> object\r
+ * @throws SAXException\r
+ */\r
+ public void write(Record record) {\r
+ try {\r
+ toXml(record);\r
+ } catch (SAXException e) {\r
+ throw new MarcException("SAX error occured while writing record", e);\r
+ }\r
+ }\r
+\r
+ /**\r
+ * Returns true if indentation is active, false otherwise.\r
+ * \r
+ * @return boolean\r
+ */\r
+ public boolean hasIndent() {\r
+ return indent;\r
+ }\r
+\r
+ /**\r
+ * Activates or deactivates indentation. Default value is false.\r
+ * \r
+ * @param indent\r
+ */\r
+ public void setIndent(boolean indent) {\r
+ this.indent = indent;\r
+ }\r
+\r
+ protected void toXml(Record record) throws SAXException {\r
+ char temp[];\r
+ AttributesImpl atts = new AttributesImpl();\r
+ if (indent)\r
+ handler.ignorableWhitespace("\n ".toCharArray(), 0, 3);\r
+\r
+ handler.startElement(Constants.TURBO_MARCXML_NS_URI, RECORD, RECORD, atts);\r
+\r
+ if (indent)\r
+ handler.ignorableWhitespace("\n ".toCharArray(), 0, 5);\r
+\r
+ handler.startElement(Constants.TURBO_MARCXML_NS_URI, LEADER, LEADER, atts);\r
+ Leader leader = record.getLeader();\r
+ temp = leader.toString().toCharArray();\r
+ handler.characters(temp, 0, temp.length);\r
+ handler.endElement(Constants.TURBO_MARCXML_NS_URI, LEADER, LEADER);\r
+\r
+ Iterator<ControlField> ci = record.getControlFields().iterator();\r
+ while (ci.hasNext()) {\r
+ ControlField field = (ControlField) ci.next();\r
+ atts = new AttributesImpl();\r
+ //atts.addAttribute("", "tag", "tag", "CDATA", field.getTag());\r
+\r
+ if (indent)\r
+ handler.ignorableWhitespace("\n ".toCharArray(), 0, 5);\r
+ String elementName = CONTROL_FIELD + field.getTag();\r
+ handler.startElement(Constants.TURBO_MARCXML_NS_URI, elementName, elementName, atts);\r
+ temp = getDataElement(field.getData());\r
+ handler.characters(temp, 0, temp.length);\r
+ handler.endElement(Constants.TURBO_MARCXML_NS_URI, elementName, elementName);\r
+ }\r
+\r
+ Iterator<DataField> di = record.getDataFields().iterator();\r
+ while (di.hasNext()) {\r
+ DataField field = di.next();\r
+ atts = new AttributesImpl();\r
+ // atts.addAttribute("", "tag", "tag", "CDATA", field.getTag());\r
+ atts.addAttribute("", "ind1", "ind1", "CDATA", String.valueOf(field\r
+ .getIndicator1()));\r
+ atts.addAttribute("", "ind2", "ind2", "CDATA", String.valueOf(field\r
+ .getIndicator2()));\r
+\r
+ if (indent)\r
+ handler.ignorableWhitespace("\n ".toCharArray(), 0, 5);\r
+ StringBuffer elementName = new StringBuffer(DATA_FIELD);\r
+ elementName.append(field.getTag());\r
+ handler.startElement(Constants.TURBO_MARCXML_NS_URI, elementName.toString(), elementName.toString(), atts);\r
+\r
+ Iterator<Subfield> si = field.getSubfields().iterator();\r
+ while (si.hasNext()) {\r
+ Subfield subfield = (Subfield) si.next();\r
+ StringBuffer subfieldName = new StringBuffer(SUBFIELD); \r
+ \r
+ char code = subfield.getCode(); \r
+ // if [a-zA-Z0-9] append to elementName, otherwise use a attribute\r
+ if (code >= '0' && code <= '9' ||\r
+ code >= 'a' && code <= 'z' ||\r
+ code >= 'A' && code <= 'Z') {\r
+ subfieldName.append(code);\r
+ }\r
+ else {\r
+ atts = new AttributesImpl();\r
+ atts.addAttribute("", "code", "code", "CDATA", String\r
+ .valueOf(subfield.getCode()));\r
+ }\r
+ if (indent)\r
+ handler.ignorableWhitespace("\n ".toCharArray(), 0, 7);\r
+\r
+ handler.startElement(Constants.TURBO_MARCXML_NS_URI, subfieldName.toString(),\r
+ subfieldName.toString(), atts);\r
+ temp = getDataElement(subfield.getData());\r
+ handler.characters(temp, 0, temp.length);\r
+ handler\r
+ .endElement(Constants.TURBO_MARCXML_NS_URI, subfieldName.toString(),\r
+ subfieldName.toString());\r
+ }\r
+\r
+ if (indent)\r
+ handler.ignorableWhitespace("\n ".toCharArray(), 0, 5);\r
+\r
+ handler\r
+ .endElement(Constants.TURBO_MARCXML_NS_URI, elementName.toString(), elementName.toString());\r
+ }\r
+\r
+ if (indent)\r
+ handler.ignorableWhitespace("\n ".toCharArray(), 0, 3);\r
+\r
+ handler.endElement(Constants.TURBO_MARCXML_NS_URI, RECORD, RECORD);\r
+ }\r
+\r
+ protected char[] getDataElement(String data) {\r
+ String dataElement = null;\r
+ if (converter == null)\r
+ return data.toCharArray();\r
+ dataElement = converter.convert(data);\r
+ if (normalize)\r
+ dataElement = Normalizer.normalize(dataElement, Normalizer.NFC);\r
+ return dataElement.toCharArray();\r
+ }\r
+}
\ No newline at end of file