Web Client Programming with Java


Web Client Programming with Java

Elliotte Rusty Harold

SDExpo 2000 East, November 2, 2000

elharo@metalab.unc.edu

http://metalab.unc.edu/javafaq/


What is Web Client Programming with Java?


Parsing HTML is Hard


Swing Supports HTML


HTML on Components

    JButton jb = new JButton("<html><b><i>Hello World!</i></b></html>");

Avoid Uppercase HTML

Upper case HTML doesn't work:

    JButton jb = new JButton("<HTML><B><I>Hello World!</I></B></HTML>");

On the other hand, Sun has no qualms with malformed HTML that omits the end tags like this:

 JButton jb = new JButton("<html><b><i>Hello World!");

Including HTML in a JLabel

import java.applet.*;
import javax.swing.*;

 
/**
 * Applet whose entire content is a single JLabel rendered from an
 * HTML string.
 */
public class HTMLLabelApplet extends JApplet {

  /**
   * Assembles the HTML markup, wraps it in a JLabel and adds that label
   * to the applet's content pane.
   */
  public void init() {

    // The pieces are concatenated into one <html>...</html> string;
    // the leading <html> tag is what makes the label text HTML.
    String markup =
       "<html>Hello! This is a multiline label with <b>bold</b> "
     + "and <i>italic</i> text. <P> "
     + "It can use paragraphs, horizontal lines, <hr> "
     + "<font color=red>colors</font> "
     + "and most of the other basic features of HTML 3.2</html>";

    JLabel htmlLabel = new JLabel(markup);
    getContentPane().add(htmlLabel);

  }

}

The Actual Applet


What's Supported?


JEditorPane


JEditorPane Constructors

public JEditorPane()

public JEditorPane(URL initialPage) throws IOException

public JEditorPane(String url) throws IOException

public JEditorPane(String mimeType, String text)


JEditorPane setPage() and setText()

public void setPage(URL page) throws IOException

public void setPage(String url) throws IOException

public void setText(String text)


Using a JEditorPane to display a web page

import javax.swing.text.*;
import javax.swing.*;
import java.io.*;
import java.awt.*;

/**
 * Displays the O'Reilly home page in a read-only JEditorPane placed
 * inside a scrollable window.
 */
public class OReillyHomePage {

  /**
   * Loads the page and shows the window. If the page cannot be loaded,
   * a short HTML error message is shown in the pane instead.
   *
   * @param args command line arguments; not used
   */
  public static void main(String[] args) {
        
     JEditorPane jep = new JEditorPane();
     jep.setEditable(false);   
     
     try {
       jep.setPage("http://www.oreilly.com");
     }
     catch (IOException e) {
       // Fall back to an inline HTML message; the content type has to be
       // set explicitly because no page was loaded to establish it.
       jep.setContentType("text/html");
       jep.setText("<html>Could not load http://www.oreilly.com </html>");
     } 
      
     JScrollPane scrollPane = new JScrollPane(jep);     
     JFrame f = new JFrame("O'Reilly & Associates");
     // Next line requires Java 1.3
     f.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
     f.getContentPane().add(scrollPane);
     f.setSize(512, 342);
     // setVisible(true) replaces the deprecated Window.show()
     f.setVisible(true);
    
  }

}

JEditorPane displaying a Web Page


What JEditorPane Doesn't Do


The second constructor

public JEditorPane(URL u)


     // Build the window first so that either the page or an error
     // message can be added to it below.
     JFrame f = new JFrame("O'Reilly & Associates");
     f.setDefaultCloseOperation(WindowConstants.DISPOSE_ON_CLOSE);
    
     try {
       URL u = new URL("http://www.oreilly.com");
       // This constructor loads the page, so it can throw IOException
       JEditorPane jep = new JEditorPane(u);
       jep.setEditable(false);   
       JScrollPane scrollPane = new JScrollPane(jep);     
       f.getContentPane().add(scrollPane);
     }
     catch (IOException e) {
       // Use a Swing JLabel rather than a heavyweight AWT Label so the
       // Swing container holds only lightweight components.
       f.getContentPane().add(
        new JLabel("Could not load http://www.oreilly.com"));
     } 
      
     f.setSize(512, 342);
     // setVisible(true) replaces the deprecated Window.show()
     f.setVisible(true);

The third constructor

public JEditorPane(String url)


     try {
       // The String constructor loads the named URL, so it can throw
       // IOException just like the URL-based constructor.
       JEditorPane jep = new JEditorPane("http://www.oreilly.com");
       jep.setEditable(false);   
       JScrollPane scrollPane = new JScrollPane(jep);     
       f.getContentPane().add(scrollPane);
     }
     catch (IOException e) {
       // Use a Swing JLabel rather than a heavyweight AWT Label so the
       // Swing container holds only lightweight components.
       f.getContentPane().add(
        new JLabel("Could not load http://www.oreilly.com"));
     } 

The fourth constructor

public JEditorPane(String mimeType, String text)

JEditorPane jep = new JEditorPane("text/html",
 "<html><h1>Hello World!</h1> <h2>Goodbye World!</h2></html>");

Constructing HTML User Interfaces on the Fly

import javax.swing.text.*;
import javax.swing.*;
import java.io.*;
import java.awt.*;

/**
 * Generates an HTML page listing the first 50 Fibonacci numbers and
 * displays it in a read-only JEditorPane.
 */
public class Fibonacci {

  /**
   * Builds the HTML document in memory, hands it to a JEditorPane with
   * the text/html content type and shows the pane in a scrollable window.
   *
   * @param args command line arguments; not used
   */
  public static void main(String[] args) {
              
     StringBuffer result = 
      new StringBuffer("<html><body><h1>Fibonacci Sequence</h1><ol>");   
     
     long low = 0;
     long high = 1;
        
     // One <li> per term; low always holds the term being printed and
     // high the term after it.
     for (int i = 0; i < 50; i++) {
       result.append("<li>");
       result.append(low);
       long temp = high;
       high = low + high;
       low = temp;
     }  
        
     result.append("</ol></body></html>");
     
     JEditorPane jep = new JEditorPane("text/html", result.toString());
     jep.setEditable(false);    
      
     JScrollPane scrollPane = new JScrollPane(jep);     
     JFrame f = new JFrame("Fibonacci Sequence");
     f.setDefaultCloseOperation(WindowConstants.DISPOSE_ON_CLOSE);
     f.getContentPane().add(scrollPane);
     f.setSize(512, 342);
     // setVisible(true) replaces the deprecated Window.show()
     f.setVisible(true);
    
  }

}

Constructing HTML User Interfaces on the Fly, an Example


Handling Hyperlinks


Three kinds of link event

HyperlinkEvents are fired not just when the user clicks the link but also when the mouse enters or exits the link area. Thus, before changing the page, you'll want to check the type of the event with the getEventType() method:

public HyperlinkEvent.EventType getEventType()

This will return one of the three mnemonic constants


HyperlinkListener Example

import javax.swing.*;
import javax.swing.event.*;

 
/**
 * Hyperlink listener that loads an activated link into the JEditorPane
 * it was constructed with.
 */
public class LinkFollower implements HyperlinkListener {

  private final JEditorPane pane;
  
  /**
   * @param pane the editor pane in which followed links are displayed
   */
  public LinkFollower(JEditorPane pane) {
    this.pane = pane;
  }

  /**
   * Follows the link when it is activated; ENTERED and EXITED events
   * are ignored. Failures are reported on System.err instead of being
   * silently discarded.
   *
   * @param evt the hyperlink event delivered by the editor pane
   */
  public void hyperlinkUpdate(HyperlinkEvent evt) {
    
    if (evt.getEventType() != HyperlinkEvent.EventType.ACTIVATED) {
      return;
    }
    
    // The event carries no URL when the link text could not be turned
    // into a valid one; report it rather than passing null to setPage().
    java.net.URL target = evt.getURL();
    if (target == null) {
      System.err.println("Cannot follow link: " + evt.getDescription());
      return;
    }
    
    try {
      pane.setPage(target);        
    }
    catch (java.io.IOException e) {
      System.err.println("Could not load " + target + ": " + e);
    } 
    
  }

}

A very simple web browser

import javax.swing.text.*;
import javax.swing.*;
import java.io.*;
import java.awt.*;

/**
 * A minimal web browser: one read-only JEditorPane whose hyperlinks are
 * followed by a LinkFollower.
 */
public class SimpleWebBrowser {

  /**
   * Shows the page named by the first command line argument, or a
   * default page when no argument is given. If the initial page cannot
   * be loaded, a usage message and the error are printed and the
   * program exits.
   *
   * @param args optional; args[0] is the URL of the first page to show
   */
  public static void main(String[] args) {
        
    // get the first URL
    String initialPage = "http://metalab.unc.edu/javafaq/";
    if (args.length > 0) initialPage = args[0];       
    
    // set up the editor pane
    JEditorPane jep = new JEditorPane();
    jep.setEditable(false);   
    jep.addHyperlinkListener(new LinkFollower(jep));
    
    try {
      jep.setPage(initialPage);      
    }
    catch (IOException e) {
      System.err.println("Usage: java SimpleWebBrowser url"); 
      System.err.println(e);
      System.exit(-1);
    }
      
    // set up the window
    JScrollPane scrollPane = new JScrollPane(jep);     
    JFrame f = new JFrame("Simple Web Browser");
    f.setDefaultCloseOperation(WindowConstants.DISPOSE_ON_CLOSE);
    f.getContentPane().add(scrollPane);
    f.setSize(512, 342);
    // setVisible(true) replaces the deprecated Window.show()
    f.setVisible(true);
    
  }

}

Reading HTML Directly

public void read(InputStream in, Object document) throws IOException


An example of reading HTML directly

     JEditorPane jep = new JEditorPane();
     jep.setEditable(false);   
     // Install the HTML kit and create a matching document up front, so
     // that read() is handed an HTMLDocument.
     EditorKit htmlKit = jep.getEditorKitForContentType("text/html");
     HTMLDocument doc = (HTMLDocument) htmlKit.createDefaultDocument();
     jep.setEditorKit(htmlKit);
     
     try {
       URL u = new URL("http://www.macfaq.com");
       InputStream in = u.openStream();
       try {
         jep.read(in, doc);
       }
       finally {
         // release the connection whether or not the read succeeded
         in.close();
       }
     }
     catch (IOException e) {
       System.err.println(e);
     } 
      
     JScrollPane scrollPane = new JScrollPane(jep);     
     JFrame f = new JFrame("Macfaq");
     f.setDefaultCloseOperation(WindowConstants.DISPOSE_ON_CLOSE);
     f.getContentPane().add(scrollPane);
     f.setSize(512, 342);
     // setVisible(true) replaces the deprecated Window.show()
     f.setVisible(true);

Parsing HTML


javax.swing.text.html


HTMLEditorKit.Parser


HTMLEditorKit.ParserCallback

To parse an HTML file you write a subclass of HTMLEditorKit.ParserCallback that responds to text and tags as you desire. Then you pass an instance of your subclass to the HTMLEditorKit.Parser's parse() method along with the Reader from which the HTML will be read:

public void parse(Reader in, HTMLEditorKit.ParserCallback callback, boolean ignoreCharacterSet) throws IOException


Getting an Instance of HTMLEditorKit.ParserCallback


A Backdoor approach to an HTML Parser

import javax.swing.text.html.*;

/**
 * HTMLEditorKit subclass that exposes the kit's parser to code outside
 * the class hierarchy.
 */
public class ParserGetter extends HTMLEditorKit {

  /**
   * Returns the parser supplied by HTMLEditorKit. The override adds no
   * behavior; it only widens the method's visibility to public.
   *
   * @return the parser obtained from the superclass
   */
  public HTMLEditorKit.Parser getParser() {
    HTMLEditorKit.Parser inherited = super.getParser();
    return inherited;
  }

}

The parse() method of HTMLEditorKit.Parser

public abstract void parse(Reader input, HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet) throws IOException


HTMLEditorKit.ParserCallback

The ParserCallback class is a public static nested class inside javax.swing.text.html.HTMLEditorKit.

public static class HTMLEditorKit.ParserCallback extends Object

It has a single public, no-argument constructor:

public HTMLEditorKit.ParserCallback()

However, you probably won't use this directly because the standard implementation of this class does nothing. It exists to be subclassed. It has six callback methods that do nothing. You will override these methods to respond to specific items seen in the input stream as the document is parsed.

public void handleText(char[] text, int position)
public void handleComment(char[] text, int position)
public void handleStartTag(HTML.Tag tag, MutableAttributeSet attributes, int position)
public void handleEndTag(HTML.Tag tag, int position)
public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attributes, int position)
public void handleError(String errorMsg, int position)

There's also a flush() method you use to perform any final cleanup. The parser invokes this method once after it's finished parsing the document.

public void flush() throws BadLocationException


Tag Stripping Example

import javax.swing.text.html.*;
import java.io.*;


/**
 * Parser callback that copies only the text content of a document to a
 * Writer; tags, comments and errors are left to the inherited no-op
 * handlers.
 */
public class TagStripper extends HTMLEditorKit.ParserCallback {

  private Writer out;

  /**
   * @param out destination for the extracted text
   */
  public TagStripper(Writer out) {
    this.out = out;
  }

  /**
   * Writes one run of text to the destination and flushes it at once.
   * An IOException is reported on System.err and otherwise ignored.
   *
   * @param text     the characters of the text run
   * @param position position reported by the parser; not used
   */
  public void handleText(char[] text, int position) {
    Writer sink = this.out;
    try {
      sink.write(text);
      sink.flush();
    }
    catch (IOException ex) {
      System.err.println(ex);
    }
  }

}

Tag Stripping Example, Part II

// Begin by retrieving a parser using the ParserGetter class:
    ParserGetter kit = new ParserGetter();
    HTMLEditorKit.Parser parser = kit.getParser();
    
// Next, construct an instance of your callback class like this:

    HTMLEditorKit.ParserCallback callback 
     = new TagStripper(new OutputStreamWriter(System.out));
     
// Then get a stream you can read the HTML document from. For example, 

    try {
      URL u = new URL("http://www.oreilly.com");
      InputStream in = u.openStream();
      try {
        InputStreamReader r = new InputStreamReader(in);

// Finally, pass the Reader and HTMLEditorKit.ParserCallback to the 
// HTMLEditorKit.Parser's parse() method, like this:

        parser.parse(r, callback, false);
      }
      finally {
        // release the connection whether or not parsing succeeded
        in.close();
      }
    }
    catch (IOException e) {
      System.err.println(e); 
    }

Details

<H1> Here's   the   Title </H1>

<P> Here's the text </P>

What actually comes out of the tag stripper is:

Here's the TitleHere's the text

The single exception is the PRE element which maintains all white space in its contents unedited.


Retaining Line Breaks

Short of implementing your own parser, I don't know of any way to retain all the stripped space. But you can include the minimum necessary line breaks and white space by looking at the tags as well as the text. Generally you expect a single break in HTML when you see one of these tags:

<BR>
<LI>
<TR>

You expect a double break (paragraph break) when you see one of these tags:

<P>
</H1> </H2> </H3> </H4> </H5> </H6>
<HR>
<DIV>
</UL> </OL> </DL>

To include line breaks in the output you have to look at each tag as it's processed and determine whether it falls in one of these sets. This is straightforward because the first argument passed to each of the tag callback methods is an HTML.Tag object.


HTML.Tag

HTML.Tag is a public static nested class in the javax.swing.text.html.HTML class.

public static class HTML.Tag extends Object

It has these four methods:

public boolean isBlock()
public boolean breaksFlow()
public boolean isPreformatted()
public String toString()


LineBreakingTagStripper

import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
import java.io.*;
import java.net.*;


/**
 * Parser callback that strips tags from a document but writes line
 * breaks where end tags and simple tags imply them, so the text keeps
 * its basic line and paragraph structure.
 */
public class LineBreakingTagStripper extends HTMLEditorKit.ParserCallback {

  private Writer out;
  private String lineSeparator;
  
  /**
   * Uses the platform line separator (the line.separator system
   * property, falling back to "\r\n").
   *
   * @param out destination for the extracted text
   */
  public LineBreakingTagStripper(Writer out) {
    this(out, System.getProperty("line.separator", "\r\n")); 
  }  
  
  /**
   * @param out           destination for the extracted text
   * @param lineSeparator string written for each line break
   */
  public LineBreakingTagStripper(Writer out, String lineSeparator) {
    this.out = out; 
    this.lineSeparator = lineSeparator;
  }  
  
  /**
   * Writes a run of text and flushes it. An IOException is reported on
   * System.err.
   */
  public void handleText(char[] text, int position) {
    try {
      out.write(text);
      out.flush();
    }
    catch (IOException e) {
      System.err.println(e); 
    }
  }
  
  /**
   * Writes the break implied by an end tag: two separators for a block
   * tag, one for a tag that breaks the flow, nothing otherwise.
   */
  public void handleEndTag(HTML.Tag tag, int position) {
    writeBreak(tag, false);
  }

  /**
   * Writes the break implied by a simple tag: two separators for a
   * block tag, one for a tag that breaks the flow, and a single space
   * for any other tag.
   */
  public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attributes, 
   int position) {
    writeBreak(tag, true);
  }
  
  // Shared break logic for the two tag handlers. Flushes after writing
  // so separators that follow the last text run are not left sitting
  // in a buffered writer.
  private void writeBreak(HTML.Tag tag, boolean spaceForInlineTag) {
    
    try {
      if (tag.isBlock()) {
        out.write(lineSeparator);
        out.write(lineSeparator);
      }
      else if (tag.breaksFlow()) {
        out.write(lineSeparator);
      }
      else if (spaceForInlineTag) {
        out.write(' '); 
      }
      out.flush();
    }
    catch (IOException e) {
      System.err.println(e); 
    }
 
  }
  
}

Identifying Tags

You determine the type of a tag by comparing it against these 73 mnemonic constants from the HTML.Tag class:


An Outliner Example

import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
import java.io.*;
import java.net.*;
import java.util.*;

/**
 * Parser callback that turns the H1-H6 headings of an HTML document
 * into nested unordered lists, one list level per heading level.
 */
public class Outliner extends HTMLEditorKit.ParserCallback {

  private Writer out;
  // heading level of the most recent heading; 0 before the first one
  private int level = 0;
  // true while the parser is between a heading's start and end tags
  private boolean inHeader=false;
  private static final String lineSeparator 
   = System.getProperty("line.separator", "\r\n");
  
  /**
   * @param out destination for the generated outline
   */
  public Outliner(Writer out) {
    this.out = out;
  }

  /**
   * On a heading start tag, opens or closes lists until the nesting
   * depth matches the heading's level and starts a new list item.
   * All other tags are ignored.
   */
  public void handleStartTag(HTML.Tag tag, 
   MutableAttributeSet attributes, int position) {
    
    int newLevel = 0;
    if (tag == HTML.Tag.H1) newLevel = 1;
    else if (tag == HTML.Tag.H2) newLevel = 2;
    else if (tag == HTML.Tag.H3) newLevel = 3;
    else if (tag == HTML.Tag.H4) newLevel = 4;
    else if (tag == HTML.Tag.H5) newLevel = 5;
    else if (tag == HTML.Tag.H6) newLevel = 6;
    else return;
    
    this.inHeader = true;
    try {
      if (newLevel > this.level) {
        // deeper heading: open one list per level skipped
        for (int i =0; i < newLevel-this.level; i++) {
          out.write("<ul>" + lineSeparator + "<li>");
        }
      }
      else if (newLevel < this.level) {
        // shallower heading: close the lists that are no longer open
        for (int i =0; i < this.level-newLevel; i++) {
          out.write(lineSeparator + "</ul>" + lineSeparator);
        }
        out.write(lineSeparator + "<li>");
      }
      else {
        out.write(lineSeparator + "<li>"); 
      }
      this.level = newLevel;
      out.flush();
    }
    catch (IOException e) {
      System.err.println(e);
    }
    
  }
  
  /**
   * Marks the end of a heading, and closes the outline when the end of
   * the HTML element is reached.
   */
  public void handleEndTag(HTML.Tag tag, int position) {

    if (tag == HTML.Tag.H1 || tag == HTML.Tag.H2 
     || tag == HTML.Tag.H3 || tag == HTML.Tag.H4
     || tag == HTML.Tag.H5 || tag == HTML.Tag.H6) {
      inHeader = false;
    }
    
    // work around bug in the parser that fails to call flush
    if (tag == HTML.Tag.HTML) this.flush();
    
  }
  
  
  /**
   * Copies text to the output only while inside a heading.
   */
  public void handleText(char[] text, int position) { 
    
    if (inHeader) {
      try { 
        out.write(text);
        out.flush();
      }
      catch (IOException e) {
        System.err.println(e);
      }
    }
    
  }
  
  /**
   * Closes every list that is still open and flushes the output.
   * Calling it again afterwards writes nothing further.
   */
  public void flush() {
    try {
      // Decrement inside the loop body so level ends at exactly 0;
      // a post-decrement in the loop condition would leave it at -1.
      while (this.level > 0) {
        out.write(lineSeparator + "</ul>");   
        this.level--;
      } 
      out.flush();
    }
    catch (IOException e) {
      System.err.println(e);
    }
  } 
  
  /**
   * Prints the outline of the page at the URL given as the first
   * command line argument to System.out.
   *
   * @param args args[0] is the URL of the page to outline
   */
  public static void main(String[] args) { 
    
    // Check the argument explicitly rather than relying on an
    // ArrayIndexOutOfBoundsException.
    if (args.length == 0) {
      System.out.println("Usage: java Outliner url"); 
      return;
    }
    
    ParserGetter kit = new ParserGetter();
    HTMLEditorKit.Parser parser = kit.getParser();
  
    try {
      URL u = new URL(args[0]);
      InputStream in = u.openStream();
      try {
        InputStreamReader r = new InputStreamReader(in);
        HTMLEditorKit.ParserCallback callback = new Outliner
         (new OutputStreamWriter(System.out));
        parser.parse(r, callback, false);
      }
      finally {
        // release the connection whether or not parsing succeeded
        in.close();
      }
    }
    catch (IOException e) {
      System.err.println(e); 
    }
          
  }
  
}

Outliner Example Output

D:\JAVA\JNP2\examples\08>java Outliner http://metalab.unc.edu/xml/
<ul>
<li> Cafe con Leche XML News and Resources<ul>
<li><ul>
<li>XML Overview
<li>Random Notes
<li>Specifications
<li>Books
<li>XML Resources
<li>Development Tools<ul>
<li>Validating Parsers
<li>Non-validating Parsers
<li>Online Validators and Syntax Checkers
<li>Formatting Engines
<li>Browsers
<li>Class Libraries
<li>Editors
<li>XLL
<li>XML Applications
<li>External Sites
</ul>

</ul>

<li>Quote of the Day
<li>Today's News
<li>Recommended Reading
<li>Recent News</ul>
</ul>

View in Browser

Attributes

The second argument to the handleStartTag() and handleSimpleTag() callback methods is an instance of the javax.swing.text.MutableAttributeSet class, which allows you to see what attributes are attached to a particular tag.

public abstract interface MutableAttributeSet extends AttributeSet

The AttributeSet interface declares these methods:

public int getAttributeCount()
public boolean isDefined(Object name)
public boolean containsAttribute(Object name, Object value)
public boolean containsAttributes(AttributeSet attributes)
public boolean isEqual(AttributeSet attributes)
public AttributeSet copyAttributes()
public Enumeration getAttributeNames()
public Object getAttribute(Object name)

public AttributeSet getResolveParent()


AttributeSet Example

Given an AttributeSet this method prints the attributes in name=value format:

  // Prints every attribute in the set to System.out, one name=value
  // pair per line.
  private void listAttributes(AttributeSet attributes) {
    for (Enumeration names = attributes.getAttributeNames();
         names.hasMoreElements(); ) {
      Object key = names.nextElement();
      System.out.println(key + "=" + attributes.getAttribute(key));
    }
  }

Although the argument and return types of these methods are mostly declared in terms of java.lang.Object, in practice all values are instances of java.lang.String while all names are instances of the public inner class javax.swing.text.html.HTML.Attribute.


The 80 predefined attributes are:


MutableAttributeSet

The MutableAttributeSet interface adds six methods to add and remove attributes from the set:

public void addAttribute(Object name, Object value)
public void addAttributes(AttributeSet attributes)
public void removeAttribute(Object name)
public void removeAttributes(Enumeration names)
public void removeAttributes(AttributeSet attributes)
public void setResolveParent(AttributeSet parent)

Again the values are strings and the names are HTML.Attribute objects.


PageSaver

import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
import java.io.*;
import java.net.*;
import java.util.*;

/**
 * Parser callback that writes a parsed HTML page back out, rewriting
 * link-style attributes as absolute URLs so the saved copy still
 * points at the original site.
 */
public class PageSaver extends HTMLEditorKit.ParserCallback {

  private Writer out;
  private URL base;
  
  /**
   * @param out  destination for the rewritten page
   * @param base URL of the page; relative URLs are resolved against it
   */
  public PageSaver(Writer out, URL base) {
    this.out = out;
    this.base = base;
  }

  /**
   * Writes a start tag with its attributes. An APPLET tag that has no
   * codebase attribute is given one derived from the base URL.
   */
  public void handleStartTag(HTML.Tag tag, 
   MutableAttributeSet attributes, int position) {
    try {  
      out.write("<" + tag);
      this.writeAttributes(attributes);
      // for the <APPLET> tag we may have to add a codebase attribute
      if (tag == HTML.Tag.APPLET 
       && attributes.getAttribute(HTML.Attribute.CODEBASE) == null) {
        String codebase = base.toString();
        // strip the page's file name so the codebase names its directory
        if (codebase.endsWith(".htm") || codebase.endsWith(".html")) {
          codebase = codebase.substring(0, codebase.lastIndexOf('/'));   
        }
        out.write(" codebase=\"" + codebase + "\""); 
      }
      out.write(">");
      out.flush();
    }
    catch (IOException e) {
      System.err.println(e);
      e.printStackTrace();
    }
  }
  
  /**
   * Writes an end tag.
   */
  public void handleEndTag(HTML.Tag tag, int position) {
    try {    
      out.write("</" + tag + ">");
      out.flush();
    }
    catch (IOException e) {
      System.err.println(e);
    }
  }
  
  // Writes each attribute as name="value". HREF, SRC, LOWSRC and
  // CODEBASE values are resolved against the base URL first; an
  // attribute whose URL is malformed is reported and left out.
  private void writeAttributes(AttributeSet attributes) 
   throws IOException {
    
    Enumeration e = attributes.getAttributeNames();
    while (e.hasMoreElements()) {
      Object name = e.nextElement();
      // The parser tags synthesized elements with the IMPLIED key and a
      // Boolean value. It is not an attribute of the document, and
      // casting its value to String would throw ClassCastException.
      if (HTMLEditorKit.ParserCallback.IMPLIED.equals(name)) {
        continue;
      }
      String value = String.valueOf(attributes.getAttribute(name));
      try {
        if (name == HTML.Attribute.HREF || name == HTML.Attribute.SRC 
         || name == HTML.Attribute.LOWSRC 
         || name == HTML.Attribute.CODEBASE ) {
          URL u = new URL(base, value);
          out.write(" " + name + "=\"" + u + "\"");              
        }
        else {
          out.write(" " + name + "=\"" + value + "\"");
        }
      }
      catch (MalformedURLException ex) {
        System.err.println(ex);
        System.err.println(base);
        System.err.println(value);
        ex.printStackTrace();
      }
    }
  }
  
  /**
   * Writes a comment, wrapped in comment delimiters.
   */
  public void handleComment(char[] text, int position) { 
    
    try {
      out.write("<!-- ");
      out.write(text);
      out.write(" -->");
      out.flush();
    }
    catch (IOException e) {
      System.err.println(e);
    }
    
  }
  
  /**
   * Writes a run of text unchanged.
   */
  public void handleText(char[] text, int position) { 
    
    try { 
      out.write(text);
      out.flush();
    }
    catch (IOException e) {
      System.err.println(e);
      e.printStackTrace();
    }
    
  }
  
  /**
   * Writes a simple (empty) tag with its attributes.
   */
  public void handleSimpleTag(HTML.Tag tag, 
   MutableAttributeSet attributes, int position) {
    try {
      out.write("<" + tag);
      this.writeAttributes(attributes);
      out.write(">");
      // flush here too, as the other handlers do
      out.flush();
    }
    catch (IOException e) {
      System.err.println(e);
      e.printStackTrace();
    }
  }

  /**
   * Saves each URL named on the command line beneath a local directory
   * named after the URL's host, mirroring the remote directory path.
   *
   * @param args the URLs of the pages to save
   */
  public static void main(String[] args) { 
    
    for (int i = 0; i < args.length; i++) { 
      
      ParserGetter kit = new ParserGetter();
      HTMLEditorKit.Parser parser = kit.getParser();
    
      InputStream in = null;
      FileWriter out = null;
      try {
        URL u = new URL(args[i]);
        in = u.openStream();
        InputStreamReader r = new InputStreamReader(in);
        String remoteFileName = u.getFile();
        // A URL with no path at all (http://host) names the site's
        // index page just as one that ends in a slash does.
        if (remoteFileName.length() == 0 || remoteFileName.endsWith("/")) {
          remoteFileName += "index.html";
        }
        if (remoteFileName.startsWith("/")) {
          remoteFileName = remoteFileName.substring(1);
        }
        // Peel the directory parts off the front of the remote name and
        // append them to the local directory, leaving the bare file name.
        File localDirectory = new File(u.getHost());
        while (remoteFileName.indexOf('/') > -1) {
          String part = remoteFileName.substring(0, remoteFileName.indexOf('/'));
          remoteFileName = remoteFileName.substring(remoteFileName.indexOf('/')+1);
          localDirectory = new File(localDirectory, part);
        }
        // mkdirs() returns false when the directory already exists, so
        // test for that first; otherwise a second page from the same
        // directory would silently not be saved.
        if (localDirectory.isDirectory() || localDirectory.mkdirs()) {
          File output = new File(localDirectory, remoteFileName);
          out = new FileWriter(output);
          HTMLEditorKit.ParserCallback callback = new PageSaver(out, u);
          parser.parse(r, callback, false);
        }
        else {
          System.err.println("Could not create directory " + localDirectory);
        }
      }
      catch (IOException e) {
        System.err.println(e); 
        e.printStackTrace();
      }
      finally {
        // Close the output file and the connection for this URL before
        // moving on to the next one.
        try {
          if (out != null) out.close();
        }
        catch (IOException e) {
          System.err.println(e);
        }
        try {
          if (in != null) in.close();
        }
        catch (IOException e) {
          System.err.println(e);
        }
      }
      
    } 
    
  }
  
}

To Learn More


Questions?


Index | Cafe au Lait

Copyright 2000 Elliotte Rusty Harold
elharo@metalab.unc.edu
Last Modified November 8, 2000