Converting slashdot.xml to Tab-Delimited Text


import org.xml.sax.*;
import org.xml.sax.helpers.*;
import java.io.*;
import java.net.*;


/**
 * SAX1 {@link DocumentHandler} that flattens the stories of a Slashdot-style
 * headline document into tab-delimited text.
 *
 * <p>The text of each <code>title</code>, <code>url</code>,
 * <code>author</code>, <code>department</code>, <code>topic</code>,
 * <code>comments</code> and <code>section</code> element is written followed
 * by a tab. The text of each <code>image</code> element is written followed
 * by CR LF, so a line ends wherever an <code>image</code> element ends.
 * All other elements (for example <code>time</code>) and any text outside
 * the elements listed above are not written.</p>
 *
 * <p>Tabs, carriage returns and line feeds that occur inside a field are
 * written as single spaces so that they cannot be mistaken for field or
 * record separators.</p>
 */
public class SlashdotTab implements DocumentHandler {

  /** Destination of the tab-delimited text. */
  private Writer out;

  /**
   * Creates a handler that writes to the given writer.
   *
   * @param out destination of the tab-delimited text; it is flushed, but
   *            not closed, when the document ends
   */
  public SlashdotTab(Writer out) {
    this.out = out;
  }

  /**
   * Creates a handler that writes to the given stream. The stream is wrapped
   * in an {@link OutputStreamWriter} constructed without an explicit
   * encoding, i.e. using the platform default.
   *
   * @param out destination stream
   */
  public SlashdotTab(OutputStream out) {
    this(new OutputStreamWriter(out));
  }

  /** Not used; the locator is ignored. */
  public void setDocumentLocator(Locator locator) {}

  /** Nothing to do at the start of the document. */
  public void startDocument() throws SAXException {}

  /**
   * Flushes the output so nothing stays behind in the writer's buffer.
   *
   * @throws SAXException wrapping any IOException raised by the flush
   */
  public void endDocument() throws SAXException {
    try {
      out.flush();
    }
    catch (IOException e) {
      throw new SAXException(e);
    }
  }

  /*  Shape of the input this handler is written for:

      <story>
        <title>The Onion to buy the New York Times</title>
        <url>http://slashdot.org/articles/00/02/19/1128240.shtml</url>
        <time>2000-02-19 17:25:15</time>
        <author>CmdrTaco</author>
        <department>stuff-to-read</department>
        <topic>media</topic>
        <comments>20</comments>
        <section>articles</section>
        <image>topicmedia.gif</image>
      </story>
  */

  // The only parser state: true while we are inside an element whose
  // character data is copied to the output, false otherwise.
  boolean useCharacters = false;

  /** Element whose end tag terminates the current output line. */
  private static final String LINE_ENDING_ELEMENT = "image";

  /**
   * Tells whether the named element is a field that is followed by a tab.
   * Keeping the list in one place guarantees that startElement() and
   * endElement() always agree on it.
   */
  private static boolean isTabTerminatedField(String name) {
    return name.equals("title") || name.equals("url")
        || name.equals("author") || name.equals("department")
        || name.equals("topic") || name.equals("comments")
        || name.equals("section");
  }

  /**
   * Switches character copying on when an exported element starts.
   *
   * @param name element name as reported by the parser
   * @param atts attributes of the element; not used
   */
  public void startElement(String name, AttributeList atts)
   throws SAXException {

    if (isTabTerminatedField(name) || name.equals(LINE_ENDING_ELEMENT)) {
      useCharacters = true;
    }

  }

  /**
   * Writes the separator that follows an exported element (a tab, or CR LF
   * after the line-ending element) and switches character copying off.
   * The end of any other element is ignored.
   *
   * @param name element name as reported by the parser
   * @throws SAXException wrapping any IOException raised by the write
   */
  public void endElement(String name) throws SAXException {

    String separator;
    if (isTabTerminatedField(name)) {
      separator = "\t";
    }
    else if (name.equals(LINE_ENDING_ELEMENT)) {
      separator = "\r\n";
    }
    else {
      return;
    }

    try {
      out.write(separator);
    }
    catch (IOException e) {
      throw new SAXException(e);
    }
    useCharacters = false;

  }

  /**
   * Copies character data to the output while inside an exported element.
   * Tab, carriage return and line feed characters are written as spaces
   * because they double as the field and record separators of the output.
   *
   * @param text   buffer holding the characters; it is not modified
   * @param start  index of the first character to use
   * @param length number of characters to use
   * @throws SAXException wrapping any IOException raised by the write
   */
  public void characters(char[] text, int start, int length)
   throws SAXException {

    if (!useCharacters) {
      return;
    }

    // Work on a copy: the buffer belongs to the parser.
    char[] field = new char[length];
    for (int i = 0; i < length; i++) {
      char c = text[start + i];
      if (c == '\t' || c == '\r' || c == '\n') {
        c = ' ';
      }
      field[i] = c;
    }

    try {
      out.write(field, 0, length);
    }
    catch (IOException e) {
      throw new SAXException(e);
    }

  }

  /** Ignorable white space is not written. */
  public void ignorableWhitespace(char[] text, int start, int length)
   throws SAXException {}

  /** Processing instructions are not written. */
  public void processingInstruction(String target, String data)
   throws SAXException {}


  /**
   * Parses a document and prints it as tab-delimited text on System.out.
   * Diagnostics go to System.err so that they never mix with the data.
   *
   * @param args optional; args[0] is the system identifier (URL) of the
   *             document to parse, defaulting to
   *             http://www.slashdot.org/slashdot.xml
   */
  // Could easily have put main() method in a separate class
  public static void main(String[] args) {

    Parser parser;
    try {
     parser = ParserFactory.makeParser();
    }
    catch (Exception e) {
      // fall back on Xerces parser by name
      try {
        parser = ParserFactory.makeParser(
         "org.apache.xerces.parsers.SAXParser");
      }
      catch (Exception ee) {
        System.err.println("Couldn't locate a SAX parser");
        return;
      }
    }

    String url = "http://www.slashdot.org/slashdot.xml";
    if (args.length != 0) {
      url = args[0];
    }

    // Install the Document Handler
    SlashdotTab handler = new SlashdotTab(System.out);
    parser.setDocumentHandler(handler);

    // command line should offer URIs or file names
    try {
      parser.parse(url);
    }
    catch (SAXParseException e) { // well-formedness error
      System.err.println(url + " is not well formed.");
      System.err.println(e.getMessage()
       + " at line " + e.getLineNumber()
       + ", column " + e.getColumnNumber());
    }
    catch (SAXException e) { // some other kind of error
      System.err.println(e.getMessage());
    }
    catch (IOException e) {
      System.err.println("Could not read " + url
       + " because of the IOException " + e);
    }
    finally {
      // endDocument() is not reached when parsing aborts, so flush here to
      // make sure the rows converted so far are actually written.
      try {
        handler.out.flush();
      }
      catch (IOException e) {
        System.err.println("Could not flush output because of the IOException "
         + e);
      }
    }

  }

}

Previous | Next | Top | Cafe con Leche

Copyright 2000 Elliotte Rusty Harold
elharo@metalab.unc.edu
Last Modified February 26, 2000