001    /*
002     * Copyright 2005 John G. Wilson
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     *     http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     *
016     */
017    
018    package groovy.util;
019    
020    import groovy.util.slurpersupport.GPathResult;
021    import groovy.util.slurpersupport.Node;
022    import groovy.util.slurpersupport.NodeChild;
023    import groovy.xml.FactorySupport;
024    
025    import java.io.File;
026    import java.io.FileInputStream;
027    import java.io.IOException;
028    import java.io.InputStream;
029    import java.io.Reader;
030    import java.io.StringReader;
031    import java.net.URL;
032    import java.util.HashMap;
033    import java.util.Hashtable;
034    import java.util.Map;
035    import java.util.Stack;
036    
037    import javax.xml.parsers.ParserConfigurationException;
038    import javax.xml.parsers.SAXParser;
039    import javax.xml.parsers.SAXParserFactory;
040    
041    import org.xml.sax.Attributes;
042    import org.xml.sax.DTDHandler;
043    import org.xml.sax.EntityResolver;
044    import org.xml.sax.ErrorHandler;
045    import org.xml.sax.InputSource;
046    import org.xml.sax.SAXException;
047    import org.xml.sax.SAXNotRecognizedException;
048    import org.xml.sax.SAXNotSupportedException;
049    import org.xml.sax.XMLReader;
050    import org.xml.sax.helpers.DefaultHandler;
051    
052    /**
053     * @author John Wilson
054     *
055     */
056    
057    public class XmlSlurper extends DefaultHandler {
058      private final XMLReader reader;
059      private Node currentNode = null;
060      private final Stack stack = new Stack();
061      private final StringBuffer charBuffer = new StringBuffer();
062      private final Map namespaceTagHints = new Hashtable();
063      private boolean keepWhitespace = false;
064    
065      public XmlSlurper() throws ParserConfigurationException, SAXException {
066        this(false, true);
067      }
068      
069      public XmlSlurper(final boolean validating, final boolean namespaceAware) throws ParserConfigurationException, SAXException {
070        SAXParserFactory factory = FactorySupport.createSaxParserFactory();
071        factory.setNamespaceAware(namespaceAware);
072        factory.setValidating(validating);
073        this.reader = factory.newSAXParser().getXMLReader();
074      }
075      
076      public XmlSlurper(final XMLReader reader) {
077        this.reader = reader;
078      }
079      
080      public XmlSlurper(final SAXParser parser) throws SAXException {
081        this(parser.getXMLReader());
082      }
083      
084      /**
085       * @param keepWhitespace
086       * 
087       * If true then whitespace before elements is kept.
088       * The deafult is to discard the whitespace.
089       */
090      public void setKeepWhitespace(boolean keepWhitespace) {
091          this.keepWhitespace = keepWhitespace;
092      }
093      
094      /**
095       * @return The GPathResult instance created by consuming a stream of SAX events
096       * Note if one of the parse methods has been called then this returns null
097       * Note if this is called more than once all calls after the first will return null
098       *
099       */
100      public GPathResult getDocument() {
101        try {
102          return new NodeChild(this.currentNode, null, this.namespaceTagHints);
103        } finally {
104          this.currentNode = null;
105        }
106      }
107      
108      /**
109       * Parse the content of the specified input source into a GPathResult object
110       * 
111       * @param input
112       * @return An object which supports GPath expressions
113       * @throws IOException
114       * @throws SAXException
115       */
116      public GPathResult parse(final InputSource input) throws IOException, SAXException {
117        this.reader.setContentHandler(this);
118        this.reader.parse(input);
119        
120        return getDocument();
121        
122      }
123      
124      /**
125       * Parses the content of the given file as XML turning it into a GPathResult object
126       * 
127       * @param file
128       * @return An object which supports GPath expressions
129       * @throws IOException
130       * @throws SAXException
131       */
132      public GPathResult parse(final File file) throws IOException, SAXException {
133      final InputSource input = new InputSource(new FileInputStream(file));
134        
135        input.setSystemId("file://" + file.getAbsolutePath());
136        
137        return parse(input);
138        
139      }
140      
141      /**
142       * Parse the content of the specified input stream into an GPathResult Object.
143       * Note that using this method will not provide the parser with any URI
144       * for which to find DTDs etc
145       * 
146       * @param input
147       * @return An object which supports GPath expressions
148       * @throws IOException
149       * @throws SAXException
150       */
151      public GPathResult parse(final InputStream input) throws IOException, SAXException {
152        return parse(new InputSource(input));
153      }
154      
155      /**
156       * Parse the content of the specified reader into a GPathResult Object.
157       * Note that using this method will not provide the parser with any URI
158       * for which to find DTDs etc
159       * 
160       * @param in
161       * @return An object which supports GPath expressions
162       * @throws IOException
163       * @throws SAXException
164       */
165      public GPathResult parse(final Reader in) throws IOException, SAXException {
166        return parse(new InputSource(in));
167      }
168      
169      /**
170       * Parse the content of the specified URI into a GPathResult Object
171       * 
172       * @param uri
173       * @return An object which supports GPath expressions
174       * @throws IOException
175       * @throws SAXException
176       */
177      public GPathResult parse(final String uri) throws IOException, SAXException {
178        return parse(new InputSource(uri));
179      }
180      
181      /**
182       * A helper method to parse the given text as XML
183       * 
184       * @param text
185       * @return An object which supports GPath expressions
186       */
187      public GPathResult parseText(final String text) throws IOException, SAXException {
188        return parse(new StringReader(text));
189      }
190      
191      // Delegated XMLReader methods
192      //------------------------------------------------------------------------
193    
194      /* (non-Javadoc)
195       * @see org.xml.sax.XMLReader#getDTDHandler()
196       */
197      public DTDHandler getDTDHandler() {
198          return this.reader.getDTDHandler();
199      }
200    
201      /* (non-Javadoc)
202       * @see org.xml.sax.XMLReader#getEntityResolver()
203       */
204      public EntityResolver getEntityResolver() {
205          return this.reader.getEntityResolver();
206      }
207    
208      /* (non-Javadoc)
209       * @see org.xml.sax.XMLReader#getErrorHandler()
210       */
211      public ErrorHandler getErrorHandler() {
212          return this.reader.getErrorHandler();
213      }
214    
215      /* (non-Javadoc)
216       * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
217       */
218      public boolean getFeature(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException {
219          return this.reader.getFeature(uri);
220      }
221    
222      /* (non-Javadoc)
223       * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
224       */
225      public Object getProperty(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException {
226          return this.reader.getProperty(uri);
227      }
228    
229      /* (non-Javadoc)
230       * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
231       */
232      public void setDTDHandler(final DTDHandler dtdHandler) {
233          this.reader.setDTDHandler(dtdHandler);
234      }
235    
236      /* (non-Javadoc)
237       * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
238       */
239      public void setEntityResolver(final EntityResolver entityResolver) {
240          this.reader.setEntityResolver(entityResolver);
241      }
242    
243      /**
244       * Resolves entities against using the suppied URL as the base for relative URLs
245       * 
246       * @param base
247       * The URL used to resolve relative URLs
248       */
249      public void setEntityBaseUrl(final URL base) {
250          this.reader.setEntityResolver(new EntityResolver() {
251              public InputSource resolveEntity(final String publicId, final String systemId) throws IOException {
252                  return new InputSource(new URL(base, systemId).openStream());
253              }
254          });
255      }
256    
257      /* (non-Javadoc)
258       * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
259       */
260      public void setErrorHandler(final ErrorHandler errorHandler) {
261          this.reader.setErrorHandler(errorHandler);
262      }
263    
264      /* (non-Javadoc)
265       * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
266       */
267      public void setFeature(final String uri, final boolean value) throws SAXNotRecognizedException, SAXNotSupportedException {
268          this.reader.setFeature(uri, value);
269      }
270    
271      /* (non-Javadoc)
272       * @see org.xml.sax.XMLReader#setProperty(java.lang.String, java.lang.Object)
273       */
274      public void setProperty(final String uri, final Object value) throws SAXNotRecognizedException, SAXNotSupportedException {
275           this.reader.setProperty(uri, value);
276      }
277      
278      
279      // ContentHandler interface
280      //-------------------------------------------------------------------------                    
281      
282      /* (non-Javadoc)
283       * @see org.xml.sax.ContentHandler#startDocument()
284       */
285      public void startDocument() throws SAXException {
286        this.currentNode = null;
287        this.charBuffer.setLength(0);
288      }
289      
290      /* (non-Javadoc)
291       * @see org.xml.sax.helpers.DefaultHandler#startPrefixMapping(java.lang.String, java.lang.String)
292       */
293      public void startPrefixMapping(final String tag, final String uri) throws SAXException {
294        this.namespaceTagHints.put(tag, uri);
295      }
296    
297      /* (non-Javadoc)
298       * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
299       */
300      public void startElement(final String namespaceURI, final String localName, final String qName, final Attributes atts) throws SAXException {
301        addCdata();
302        
303        final Map attributes = new HashMap();
304        final Map attributeNamespaces = new HashMap();
305        
306        for (int i = atts.getLength() - 1; i != -1; i--) {
307          if (atts.getURI(i).length() == 0) {
308            attributes.put(atts.getQName(i), atts.getValue(i));
309          } else {
310            attributes.put(atts.getLocalName(i), atts.getValue(i));
311            attributeNamespaces.put(atts.getLocalName(i), atts.getURI(i));
312          }
313          
314        }
315        
316        final Node newElement;
317        
318        if (namespaceURI.length() == 0){
319          newElement = new Node(this.currentNode, qName, attributes, attributeNamespaces, namespaceURI);
320        } else {
321          newElement = new Node(this.currentNode, localName, attributes, attributeNamespaces, namespaceURI);
322        }
323        
324        if (this.currentNode != null) {
325          this.currentNode.addChild(newElement);
326        }
327        
328        this.stack.push(this.currentNode);
329        this.currentNode = newElement;
330      }
331      
332      /* (non-Javadoc)
333       * @see org.xml.sax.ContentHandler#characters(char[], int, int)
334       */
335      public void characters(final char[] ch, final int start, final int length) throws SAXException {
336        this.charBuffer.append(ch, start, length);
337      }
338      
339      /* (non-Javadoc)
340       * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
341       */
342      public void endElement(final String namespaceURI, final String localName, final String qName) throws SAXException {
343        addCdata();
344        
345        final Object oldCurrentNode = this.stack.pop();
346        
347        if (oldCurrentNode != null) {
348          this.currentNode = (Node)oldCurrentNode;
349        }
350      }
351      
352      /* (non-Javadoc)
353       * @see org.xml.sax.ContentHandler#endDocument()
354       */
355      public void endDocument() throws SAXException {
356      }
357      
358      // Implementation methods
359      //-------------------------------------------------------------------------           
360      
361      /**
362       * 
363       */
364      private void addCdata() {
365        if (this.charBuffer.length() != 0) {
366          //
367          // This element is preceeded by CDATA if keepWhitespace is false (the default setting) and 
368          // it's not whitespace add it to the body
369          // Note that, according to the XML spec, we should preserve the CDATA if it's all whitespace
370          // but for the sort of work I'm doing ignoring the whitespace is preferable
371          //
372          final String cdata = this.charBuffer.toString();
373          
374          this.charBuffer.setLength(0);
375          if (this.keepWhitespace || cdata.trim().length() != 0) {
376            this.currentNode.addChild(cdata);
377          }
378        }   
379      }
380    }