Attributes Example: XLinkSpider

import org.xmlpull.v1.*;
import java.net.*;
import java.io.*;
import java.util.*;

/**
 * A small XLink spider built on the XmlPull API.
 *
 * Starting from one URL, it parses each document, collects the
 * xlink:href attribute of every element that also carries an
 * xlink:type attribute, and visits the linked documents in
 * first-in/first-out order. Every URL is fetched at most once per
 * spider instance (apart from the start URL of a repeated spider() call).
 *
 * Instances are not safe for use by multiple threads.
 */
public class PullSpider {

  // Namespace URI of the XLink attributes (xlink:type, xlink:href).
  private static final String XLINK_NAMESPACE = "http://www.w3.org/1999/xlink";

  // External forms (strings) of every URL that has been spidered or is
  // already waiting in the queue, so we neither loop forever nor fetch
  // the same document twice. Strings are stored rather than URL objects
  // because URL.equals()/hashCode() may resolve host names over the network.
  private final Set seenURIs = new HashSet();

  // URLs still to be visited. Used strictly as a FIFO queue:
  // addLast() to enqueue, removeFirst() to dequeue.
  private final LinkedList queue = new LinkedList();
  
  // Document currently being parsed; base for resolving relative hrefs.
  private URL currentURL;
  private XmlPullParser parser;
  
  /**
   * Creates a spider backed by a namespace-aware pull parser.
   *
   * @throws RuntimeException if no XmlPull implementation can be
   *     instantiated; the underlying XmlPullParserException is the cause
   */
  public PullSpider() {
      try {
        XmlPullParserFactory factory = XmlPullParserFactory.newInstance();
        // Namespace awareness is required to look attributes up
        // by (namespace URI, local name).
        factory.setNamespaceAware(true);
        this.parser = factory.newPullParser();
      }
      catch (XmlPullParserException ex) {
         // Keep the original exception as the cause so the real reason
         // (missing implementation, bad configuration) is not lost.
         throw new RuntimeException("Could not locate a pull parser", ex);   
      }
  }

  /**
   * Inspects the start-tag the parser is positioned on. If the element
   * has both xlink:type and xlink:href attributes, the href is resolved
   * against the current document and queued unless it was seen before.
   */
  private void processStartTag() {
    
    String type = parser.getAttributeValue(XLINK_NAMESPACE, "type");
    if (type == null) {
      return;
    }
    String href = parser.getAttributeValue(XLINK_NAMESPACE, "href");
    if (href == null) {
      return;
    }
    
    try {
      URL foundURL = new URL(currentURL, href);
      // Set.add() returns false when the URL was already spidered OR is
      // already queued, so a link repeated across documents is only
      // enqueued once.
      if (seenURIs.add(foundURL.toExternalForm())) {
        queue.addLast(foundURL);
      }
    }
    catch (MalformedURLException ex) {
      // Deliberately ignored: an href that cannot be turned into a URL
      // is simply not followed.
    }
  }
  
  /**
   * Spiders the given document and then, in first-in/first-out order,
   * every not-yet-seen document reachable from it through XLinks.
   * Documents that cannot be fetched or parsed are reported on
   * System.err and skipped; the crawl continues with the next URL.
   *
   * @param uri the URL to start from; a null value is reported and ignored
   */
  public void spider(URL uri) {
      
    if (uri == null) {
      System.err.println("Cannot spider a null URL");
      return;
    }
    
    // The start document is always processed, even if this instance
    // has seen it in an earlier call.
    seenURIs.add(uri.toExternalForm());
    spiderDocument(uri);
    
    // Drain the queue iteratively rather than recursively so the stack
    // depth does not grow with the number of documents crawled.
    while (!queue.isEmpty()) {
      URL nextURL = (URL) queue.removeFirst();
      spiderDocument(nextURL);
    }
    
  }

  /**
   * Fetches and parses a single document, queueing the links found in it.
   * Any failure is reported and the document is skipped.
   */
  private void spiderDocument(URL uri) {
      
    System.out.println("Spidering " + uri);
    currentURL = uri;
    try {
      InputStream in = uri.openStream();
      try {
        // null encoding: let the parser detect the encoding itself.
        parser.setInput(in, null);
        for (int event = parser.next(); 
             event != XmlPullParser.END_DOCUMENT; 
             event = parser.next()) {
          if (event == XmlPullParser.START_TAG) {
            processStartTag();
          }
        }
      }
      finally {
        // Always release the connection, whether parsing succeeded or not.
        try {
          in.close();
        }
        catch (IOException ignored) {
          // A failure to close does not change the outcome for this
          // document and must not mask a parse error.
        }
      }
    }
    catch (Exception ex) {
      // Best effort: one bad document (I/O error, malformed XML, or a
      // parser failure) must not abort the whole crawl.
      System.err.println("Skipping " + uri + ": " + ex);
    }
    
  }

  /**
   * Command-line entry point: spiders the URL given as the first argument.
   *
   * @param args args[0] is the start URL
   * @throws Exception if args[0] is not a well-formed URL
   */
  public static void main(String[] args) throws Exception {
        
    if (args.length == 0) {
      System.err.println("Usage: java PullSpider url" );
       return;  
    }
        
    PullSpider spider = new PullSpider();
    spider.spider(new URL(args[0]));
        
  } // end main

} // end PullSpider


Previous | Next | Top | Cafe con Leche

Copyright 2000-2002 Elliotte Rusty Harold
elharo@metalab.unc.edu
Last Modified November 7, 2002