// Implementation as JDOM

/*--

 Copyright 2000, 2001 Elliotte Rusty Harold.
 All rights reserved.

 I haven't yet decided on a license.
 It will be some form of open source.

 THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED
 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 SUCH DAMAGE.

 */

package com.macfaq.xml;

import java.net.URL;
import java.net.URLConnection;
import java.net.MalformedURLException;
import java.util.Stack;
import java.util.Iterator;
import java.util.List;
import java.util.LinkedList;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.io.InputStreamReader;
import java.io.BufferedInputStream;
import java.io.InputStream;

import org.jdom.Namespace;
import org.jdom.Comment;
import org.jdom.CDATA;
import org.jdom.Text;
import org.jdom.JDOMException;
import org.jdom.Attribute;
import org.jdom.Element;
import org.jdom.ProcessingInstruction;
import org.jdom.Document;
import org.jdom.DocType;
import org.jdom.EntityRef;
import org.jdom.input.SAXBuilder;
import org.jdom.input.DOMBuilder;
import org.jdom.output.XMLOutputter;

/**
 * <p><code>JDOMXIncluder</code> provides methods to
 * resolve JDOM elements and documents to produce
 * a new <code>Document</code>, <code>Element</code>, 
 * or <code>List</code> of nodes with all
 * XInclude references resolved.
 * </p>
 *
 * <p>
 * Known bugs include:
 * </p>
 * <ul>
 *  <li>XPointer fragment identifiers are not handled</li>
 *  <li>Notations and unparsed entities from the included infosets
 *      are not merged into the final infoset</li>
 * </ul>
 *
 * @author Elliotte Rusty Harold
 * @version 1.0d8, September 18, 2001
 */
public class JDOMXIncluder {

  /**
    * The XInclude namespace, created with the prefix <code>xi</code> and the
    * URI <code>http://www.w3.org/2001/XInclude</code>. An element is treated
    * as an include element only when its namespace equals this one and its
    * name is <code>include</code>.
    */
  public final static Namespace XINCLUDE_NAMESPACE
    = Namespace.getNamespace("xi", "http://www.w3.org/2001/XInclude");

  // No instances allowed: every member of this class is static
  private JDOMXIncluder() {}

  /**
    * Parser used to build each document referenced by an include element
    * whose content is parsed as XML. One instance is shared by every call.
    * NOTE(review): confirm that SAXBuilder tolerates concurrent use before
    * calling the resolve methods from several threads.
    */
  private static SAXBuilder builder = new SAXBuilder();

  /**
    * <p>
    * Produces a copy of a JDOM <code>Document</code> in which the
    * root element has been run through XInclude resolution.
    * The argument is cloned before any work is done, and it is the clone
    * that is modified and returned.
    * </p>
    *
    * <p>
    * Resolving the root yields a list of nodes. That list must contain
    * exactly one <code>Element</code>, which becomes the new root. Comments
    * and processing instructions that precede it in the list are inserted
    * just before the root, and those that follow it just after.
    * Text, strings, entity references and any other node type at this
    * level are rejected.
    * </p>
    *
    * @param original <code>Document</code> that will be processed
    * @param base     <code>String</code> form of the base URI against which
    *                 relative URLs will be resolved; it is handed on
    *                 unchanged to <code>resolve(Element, String)</code>
    * @return Document the modified clone of <code>original</code>
    * @throws NullPointerException if <code>original</code> is null
    * @throws MissingHrefException if an <code>xinclude:include</code> element does not have an href attribute.
    * @throws UnavailableResourceException if an included document cannot be located
    *                                  or cannot be read.
    * @throws MalformedResourceException if an included document is not namespace well-formed
    * @throws CircularIncludeException if this document possesses a cycle of
    *                                  XIncludes.
    * @throws XIncludeException if the resolved root is not exactly one element
    *                           surrounded only by comments and processing
    *                           instructions, or any other rule of XInclude
    *                           is violated
    */
    public static Document resolve(Document original, String base)
      throws XIncludeException {

        if (original == null) {
           throw new NullPointerException("Document must not be null");
        }

        Document copy = (Document) original.clone();
        List replacements = resolve(copy.getRootElement(), base);

        // Pass 1: make sure the replacement nodes form a legal document
        // body, remembering the single element that will become the root.
        Element replacementRoot = null;
        for (Iterator nodes = replacements.iterator(); nodes.hasNext(); ) {
            Object node = nodes.next();
            if (node instanceof Element) {
                if (replacementRoot != null) {
                    throw new XIncludeException("Tried to include multiple roots");
                }
                replacementRoot = (Element) node;
                continue;
            }
            if (node instanceof Comment || node instanceof ProcessingInstruction) {
                continue; // allowed on either side of the root
            }
            if (node instanceof Text || node instanceof String) {
                throw new XIncludeException(
                  "Tried to include text node outside of root element"
                );
            }
            if (node instanceof EntityRef) {
                throw new XIncludeException(
                  "Tried to include a general entity reference outside of root element"
                );
            }
            throw new XIncludeException(
                "Unexpected type " + node.getClass()
            );
        }

        if (replacementRoot == null) {
            throw new XIncludeException("No root element");
        }

        // Pass 2: splice the replacement nodes into the clone's content,
        // keeping them in list order around the position of the old root.
        List topLevel = copy.getContent();
        int insertAt = topLevel.indexOf(copy.getRootElement());
        Iterator remaining = replacements.iterator();

        // nodes that come before the root element
        boolean reachedRoot = false;
        while (!reachedRoot && remaining.hasNext()) {
            Object node = remaining.next();
            if (node instanceof Comment || node instanceof ProcessingInstruction) {
                topLevel.add(insertAt, node);
                insertAt++;
            }
            else if (node instanceof Element) {
                reachedRoot = true;
            }
            // any other type was already rejected by pass 1
        }

        // swap in the new root element
        copy.setRootElement(replacementRoot);

        // nodes that come after the root element
        insertAt = insertAt + 1;
        while (remaining.hasNext()) {
            Object node = remaining.next();
            if (node instanceof Comment || node instanceof ProcessingInstruction) {
                topLevel.add(insertAt, node);
                insertAt++;
            }
            // any other type was already rejected by pass 1
        }

        return copy;
  }

  /**
    * <p>
    * This method resolves a JDOM <code>Element</code>
    * and merges in all XInclude references. This process is recursive.
    * It seeds a fresh stack of base URIs with <code>base</code> (when one
    * is given) and delegates to <code>resolve(Element, Stack)</code>.
    * A referenced document that cannot be located or parsed is reported
    * by throwing an exception; it is not replaced with an error message.
    * </p>
    *
    * @param original <code>Element</code> that will be processed
    * @param base     <code>String</code> form of the base URI against which
    *                 relative URLs will be resolved. This can be null, in
    *                 which case no base URI is placed on the stack and
    *                 include elements have to supply their own
    *                 <code>xml:base</code> attribute or an absolute href.
    * @return List  A List containing all nodes that replace this element.
    *               If this element is not an <code>xinclude:include</code>
    *               this list contains a single <code>Element</code> object.
    * @throws MissingHrefException if an <code>xinclude:include</code> element does not have an href attribute.
    * @throws NullPointerException if <code>original</code> element is null.
    * @throws UnavailableResourceException if an included document cannot be located
    *                                  or cannot be read.
    * @throws MalformedResourceException if an included document is not namespace well-formed
    * @throws CircularIncludeException if this <code>Element</code> contains an XInclude element
    *                                  that attempts to include a document in which 
    *                                  this element is directly or indirectly included.
    */
    public static List resolve(Element original, String base)
     throws CircularIncludeException, XIncludeException, NullPointerException {

        if (original == null) {
          throw new NullPointerException("You can't XInclude a null element.");
        }

        // The stack is local to this call, so it never needs to be unwound.
        // The previous unconditional pop() threw EmptyStackException when
        // base was null, because nothing had been pushed.
        Stack bases = new Stack();
        if (base != null) bases.push(base);

        return resolve(original, bases);

    }

    /**
      * Tells whether an element is an XInclude <code>include</code> element:
      * its name must be <code>include</code> and its namespace must equal
      * <code>XINCLUDE_NAMESPACE</code>.
      *
      * @param element the element to test
      * @return true if the element is an include element, false otherwise
      */
    private static boolean isIncludeElement(Element element) {

        // the namespace is only consulted once the name has matched
        if (!element.getName().equals("include")) {
            return false;
        }
        return element.getNamespace().equals(XINCLUDE_NAMESPACE);

    }


  /**
    * <p>
    * This method resolves a JDOM <code>Element</code>
    * and merges in all XInclude references. This process is recursive.
    * An <code>xinclude:include</code> element is handed to
    * <code>resolveXIncludeElement</code> and replaced by the nodes it
    * refers to; any other element is rebuilt as a new
    * <code>Element</code> by <code>resolveNonXIncludeElement</code>.
    * </p>
    *
    * @param original <code>Element</code> that will be processed
    * @param bases    <code>Stack</code> containing the string forms of
    *                 all the URIs of documents which contain this element
    *                 through XIncludes. This is used to detect if any circular 
    *                 references occur. 
    * @return List  A <code>List</code> containing all nodes that replace this element.
    *               If this element is not an <code>xinclude:include</code>
    *               this list contains a single <code>Element</code> object.
    * @throws MissingHrefException if an <code>xinclude:include</code> element does not have an href attribute.
    * @throws UnavailableResourceException if an included document cannot be located
    *                                  or cannot be read.
    * @throws BadParseAttributeException if an <code>include</code> element has a <code>parse</code> attribute
    *                                  with any value other than <code>text</code> or <code>xml</code>
    * @throws MalformedResourceException if an included document is not namespace well-formed
    * @throws CircularIncludeException if this <code>Element</code> contains an XInclude element
    *                                  that attempts to include a document in which 
    *                                  this element is directly or indirectly included.
    */
    protected static List resolve(Element original, Stack bases)
      throws CircularIncludeException, MalformedResourceException, 
      UnavailableResourceException, BadParseAttributeException, XIncludeException {

        // The current base URI is read from the stack by the two helpers
        // themselves, so it is not looked up here.
        if (isIncludeElement(original)) {
            return resolveXIncludeElement(original, bases);
        }

        // Not an include element: the result is always a one-element list
        List resultList = new LinkedList();
        resultList.add(resolveNonXIncludeElement(original, bases));
        return resultList;

    }

    /**
      * <p>
      * Replaces one <code>xinclude:include</code> element with the nodes
      * it refers to.
      * </p>
      *
      * <p>
      * The target URL is built from the <code>href</code> attribute. It is
      * resolved against the element's own <code>xml:base</code> attribute if
      * present, otherwise against the URI on top of <code>bases</code>;
      * with neither, the href must be an absolute URL.
      * With <code>parse="text"</code> the target is downloaded and returned
      * as a single <code>String</code>. Otherwise (<code>parse="xml"</code>
      * or no parse attribute) the target is parsed, its root element is
      * resolved recursively, and the result is returned together with the
      * comments and processing instructions that surround the root.
      * XPointer fragment identifiers on the href are not processed.
      * </p>
      *
      * @param original the <code>xinclude:include</code> element to replace
      * @param bases    <code>Stack</code> of the string forms of the URIs of
      *                 the documents currently being included; the target
      *                 URI is pushed while its content is resolved and
      *                 popped again before this method exits
      * @return List of the nodes that replace <code>original</code>
      * @throws MissingHrefException if the element has no href attribute
      * @throws BadParseAttributeException if the parse attribute has a value
      *                                  other than <code>text</code> or <code>xml</code>
      * @throws UnavailableResourceException if the target URL is malformed or
      *                                  the target cannot be read or built
      * @throws CircularIncludeException if the target URI is already on
      *                                  <code>bases</code>
      * @throws RuntimeException if <code>original</code> is not an include element
      */
    private static List resolveXIncludeElement(Element original, Stack bases)
      throws CircularIncludeException, MalformedResourceException, 
      UnavailableResourceException, XIncludeException {

        // null (rather than "") means that no base URI is known, so that
        // an absolute href can still be resolved on its own below
        String base = null;
        if (bases.size() != 0) base = (String) bases.peek();

        // These lines are probably unnecessary
        if (!isIncludeElement(original)) {
            throw new RuntimeException("Bad private Call");
        }

        Attribute href = original.getAttribute("href");
        if (href == null) {
            throw new MissingHrefException("Missing href attribute");
        }
        String hrefValue = href.getValue();

        // an xml:base attribute on the include element overrides the stack
        Attribute baseAttribute
          = original.getAttribute("base", Namespace.XML_NAMESPACE);
        if (baseAttribute != null) {
            base = baseAttribute.getValue();
        }
        boolean hasBase = base != null && base.length() > 0;

        URL remote;
        try {
            if (hasBase) {
                remote = new URL(new URL(base), hrefValue);
            }
            else {
                remote = new URL(hrefValue);
            }
        }
        catch (MalformedURLException ex) {
            // report the attribute values, not the Attribute objects
            String attempted = hasBase ? base + "/" + hrefValue : hrefValue;
            XIncludeException xex = new UnavailableResourceException(
              "Unresolvable URL " + attempted);
            xex.setRootCause(ex);
            throw xex;
        }

        boolean parse = true;
        Attribute parseAttribute = original.getAttribute("parse");
        if (parseAttribute != null) {
            String parseValue = parseAttribute.getValue();
            if (parseValue.equals("text")) parse = false;
            else if (!parseValue.equals("xml")) {
                throw new BadParseAttributeException(
                  parseValue + " is not a legal value for the parse attribute"
                );
            }
        }

        if (!parse) { // unparsed, insert text
            String encoding = original.getAttributeValue("encoding");
            String s = downloadTextDocument(remote, encoding);
            List resultList = new LinkedList();
            resultList.add(s);
            return resultList;
        }

        String remoteForm = remote.toExternalForm();
        // Stack.contains compares with equals(), so equal URI strings match
        if (bases.contains(remoteForm)) {
            String message = "Circular XInclude Reference to " + remoteForm;
            if (hasBase) message = message + " in " + base;
            throw new CircularIncludeException(message);
        }

        boolean pushed = false;
        try {
            Document doc = builder.build(remote); // this Document object never leaves this method
            bases.push(remoteForm);
            pushed = true;

            // The whole top level of the included document is returned:
            // the resolved root plus the comments and processing
            // instructions around it. Work on a snapshot of the content
            // because detaching nodes modifies the document's own list.
            Element root = doc.getRootElement();
            Object[] topLevelNodes = doc.getContent().toArray();
            List resultList = new LinkedList();
            for (int i = 0; i < topLevelNodes.length; i++) {
                Object o = topLevelNodes[i];
                if (o == root) {
                    // resolve() returns nodes that are already free of
                    // include elements, so they are used as they are
                    resultList.addAll(resolve(root, bases));
                }
                else if (o instanceof Comment) {
                    // returned nodes must not stay attached to doc
                    ((Comment) o).detach();
                    resultList.add(o);
                }
                else if (o instanceof ProcessingInstruction) {
                    ((ProcessingInstruction) o).detach();
                    resultList.add(o);
                }
                // NOTE(review): any other top-level node is left out. Some
                // JDOM releases appear to keep the DocType in this list,
                // and it is not part of the included content -- confirm
                // against the JDOM version in use.
            }
            return resultList;
        }
        // should this be a MalformedResourceException????
        // probably; maybe check on why JDOMException was thrown
        catch (JDOMException e) {
            XIncludeException xex = new UnavailableResourceException(
              "Unresolvable URL " + href.getValue());
            xex.setRootCause(e);
            throw xex;
        }
        finally {
            // keep the caller's stack balanced even when resolution fails
            if (pushed) bases.pop();
        }

    }

    /**
      * <p>
      * Builds a resolved copy of an element that is not itself an
      * <code>xinclude:include</code> element. The copy has the same name,
      * namespace, additional namespace declarations and attributes as the
      * original. Child include elements are replaced by the nodes they
      * refer to, other child elements are copied recursively, and every
      * other child node is cloned so that the copy shares no mutable node
      * with the original. <code>String</code> children are
      * added as they are.
      * </p>
      *
      * @param original the element to copy
      * @param bases    <code>Stack</code> of the string forms of the URIs of
      *                 the documents currently being included; passed on
      *                 unchanged to the recursive calls
      * @return a new <code>Element</code> whose descendants contain no
      *         include elements
      * @throws XIncludeException if a child has an unexpected type or if
      *                           resolving a nested include element fails
      */
    private static Element resolveNonXIncludeElement(Element original, Stack bases)
      throws CircularIncludeException, MalformedResourceException, 
      UnavailableResourceException, XIncludeException {

        // Not an include element; a copy of this element in which its
        // descendants have been resolved will be returned
        Element result = new Element(original.getName(), original.getNamespace());

        // Carry over namespace declarations beyond the element's own, so
        // prefixes used in attribute values or text stay declared
        List declared = original.getAdditionalNamespaces();
        if (declared != null) {
            Iterator namespaces = declared.iterator();
            while (namespaces.hasNext()) {
                result.addNamespaceDeclaration((Namespace) namespaces.next());
            }
        }

        Iterator attributes = original.getAttributes().iterator();
        while (attributes.hasNext()) {
            Attribute a = (Attribute) attributes.next();
            result.setAttribute((Attribute) a.clone());
        }
        List newChildren = result.getContent(); // live list

        // recursively process children
        Iterator originalChildren = original.getContent().iterator();
        while (originalChildren.hasNext()) {
            Object o = originalChildren.next();
            if (o instanceof Element) {
                Element element = (Element) o;
                if (isIncludeElement(element)) {
                    newChildren.addAll(resolveXIncludeElement(element, bases));
                }
                else {
                    newChildren.add(resolveNonXIncludeElement(element, bases));
                }
            }
            else if (o instanceof String) {
                // Strings are immutable and have no parent; safe to share
                newChildren.add(o);
            }
            else if (o instanceof Text) {
                // clone, like the other node types below: the original
                // node still belongs to the original element
                Text text = (Text) o;
                newChildren.add(text.clone());
            }
            else if (o instanceof CDATA) {
                CDATA cdata = (CDATA) o;
                newChildren.add(cdata.clone());
            }
            else if (o instanceof Comment) {
                Comment c = (Comment) o;
                newChildren.add(c.clone());
            }
            else if (o instanceof EntityRef) {
                EntityRef entity = (EntityRef) o;
                newChildren.add(entity.clone());
            }
            else if (o instanceof ProcessingInstruction) {
                ProcessingInstruction pi = (ProcessingInstruction) o;
                newChildren.add(pi.clone());
            }
            else {
                throw new XIncludeException("Unexpected Type " + o.getClass());
            }
        } // end while

        return result;

    }


  /**
    * <p>
    * This utility method reads a document at a specified URL
    * and returns the contents of that document as a <code>String</code>.
    * It's used to include files with <code>parse="text"</code>.
    * </p>
    *
    * <p>
    * The character encoding is chosen in this order: the
    * <code>charset</code> parameter of the Content-Type header; for XML
    * media types, the encoding reported by
    * <code>EncodingHeuristics.readEncodingFromStream</code>; the
    * <code>encoding</code> argument; and finally UTF-8.
    * Each <code>&lt;</code> is returned as <code>&amp;lt;</code> and each
    * <code>&amp;</code> as <code>&amp;amp;</code>.
    * </p>
    *
    * @param source   <code>URL</code> of the document that will be stored in 
    *                 <code>String</code>. 
    * @param  encoding Encoding of the document; e.g. UTF-8,
    *                  ISO-8859-1, etc. Null or empty means UTF-8.
    * @return String  The document retrieved from the source <code>URL</code>.
    * @throws UnavailableResourceException if the source document cannot be located
    *                                  or cannot be read, or if its encoding
    *                                  is not supported.
    */    
    public static String downloadTextDocument(URL source, String encoding) 
      throws UnavailableResourceException {

        if (encoding == null || encoding.equals("")) encoding = "UTF-8"; 
        InputStream in = null;
        try {
            StringBuffer s = new StringBuffer();
            URLConnection uc = source.openConnection();
            String contentType = uc.getContentType();
            in = new BufferedInputStream(uc.getInputStream());

            // The character set travels in the Content-Type header. The
            // Content-Encoding header names a transfer coding such as gzip
            // and must not be used as a charset.
            String charset = charsetFromContentType(contentType);
            if (charset != null) {
                encoding = charset;
            }
            else if (isXMLMediaType(contentType)) {
                // What if file does not have a MIME type but name ends in .xml????
                // Java may be picking the type up from a file URL
                String detected = EncodingHeuristics.readEncodingFromStream(in);
                // NOTE(review): guards against the helper reporting no
                // encoding -- confirm what it returns in that case
                if (detected != null && detected.length() != 0) {
                    encoding = detected;
                }
            }

            // Decode through the reader; reading the raw stream would yield
            // single bytes and ignore the encoding entirely
            InputStreamReader reader = new InputStreamReader(in, encoding);
            int c;
            // NOTE(review): the escaping below is kept for existing callers,
            // but the result is inserted into the tree as text, where a
            // serializer would presumably escape it a second time -- confirm
            while ((c = reader.read()) != -1) {
              if (c == '<') s.append("&lt;");
              else if (c == '&') s.append("&amp;");
              else s.append((char) c);
            }
            return s.toString();
        }
        catch (UnsupportedEncodingException e) {
            UnavailableResourceException ex = new UnavailableResourceException(
              "Encoding " + encoding + " not recognized for included document: " 
              + source.toExternalForm());
            ex.setRootCause(e);
            throw ex;
        }
        catch (IOException e) {
            UnavailableResourceException ex = new UnavailableResourceException(
              "Document not found: " + source.toExternalForm());
            ex.setRootCause(e);
            throw ex;
        }
        finally {
            if (in != null) {
                try {
                    in.close();
                }
                catch (IOException ignored) {
                    // Nothing useful can be done: either the content has
                    // been read completely or a more informative exception
                    // is already on its way to the caller
                }
            }
        }

    }

    // Extracts the value of the charset parameter from a Content-Type
    // header value; returns null if the header or the parameter is missing.
    private static String charsetFromContentType(String contentType) {

        if (contentType == null) return null;

        // parameter names are case-insensitive; regionMatches avoids
        // locale-dependent case conversion
        String key = "charset=";
        int start = -1;
        for (int i = 0; i + key.length() <= contentType.length(); i++) {
            if (contentType.regionMatches(true, i, key, 0, key.length())) {
                start = i + key.length();
                break;
            }
        }
        if (start < 0) return null;

        String value = contentType.substring(start);
        int end = value.indexOf(';');
        if (end >= 0) value = value.substring(0, end);
        value = value.trim();
        // the value may be written as a quoted string
        if (value.length() >= 2 && value.charAt(0) == '"'
          && value.charAt(value.length() - 1) == '"') {
            value = value.substring(1, value.length() - 1).trim();
        }
        if (value.length() == 0) return null;
        return value;

    }

    // Tells whether a Content-Type header value names an XML media type:
    // text/xml, application/xml, or any text/ or application/ type
    // ending in +xml. Parameters after the type are ignored.
    private static boolean isXMLMediaType(String contentType) {

        if (contentType == null) return false;

        String type = contentType;
        int semicolon = type.indexOf(';');
        if (semicolon >= 0) type = type.substring(0, semicolon);
        // MIME types are case-insensitive
        type = type.trim().toLowerCase(java.util.Locale.ENGLISH);

        if (type.equals("text/xml") || type.equals("application/xml")) {
            return true;
        }
        return (type.startsWith("text/") || type.startsWith("application/"))
          && type.endsWith("+xml");

    }

    /**
      * <p>
      * Command line entry point. Each argument is built into a document,
      * resolved, and written to <code>System.out</code>. A failure on one
      * argument is reported on <code>System.err</code> and does not stop
      * the remaining arguments from being processed.
      * </p>
      *
      * @param args  <code>args[0]</code> contains the URL or file name 
      *              of the first document to be processed; <code>args[1]</code>
      *              contains the URL or file name 
      *              of the second document to be processed, etc. 
      */
    public static void main(String[] args) {

        SAXBuilder parser = new SAXBuilder();
        XMLOutputter serializer = new XMLOutputter();

        int index = 0;
        while (index < args.length) {
            String location = args[index];
            index++;
            try {
                Document source = parser.build(location);

                // absolutize URL: an argument without a colon is taken
                // to be a file name rather than a URL
                boolean looksLikeURL = location.indexOf(':') >= 0;
                String baseURI = looksLikeURL
                  ? location
                  : new File(location).toURL().toExternalForm();

                Document resolved = resolve(source, baseURI);
                // need to set encoding on this to Latin-1 and check what
                // happens to UTF-8 curly quotes
                serializer.output(resolved, System.out);
            }
            catch (Exception ex) {
                System.err.println(ex);
                ex.printStackTrace();
            }
        }

    }

}
// Previous | Next | Top | Cafe con Leche