import mshtml.*; import shdocvw.*; import javax.swing.*; import java.util.*; import java.io.IOException; /* * Copyright 2003 by Intrinsyc Software Inc. * All rights reserved. */ public class HTMLDocumentParser{ public static void main( String args[]) throws Exception { if (args.length == 0) { System.out.println("\nThis program takes a url for an argument. For example:\n"); System.out.println("java -DJINTEGRA_COINIT_VALUE=2 HTMLDocumentParser http://www.google.ca"); return; } browser = new WebBrowser(); browser.setSize(900,350); if (browser == null) System.out.println("Browser could not be created."); javax.swing.JFrame frame = new JFrame(); frame.getContentPane().add(browser); frame.pack(); frame.setVisible(true); // this is required before invoking navigate on browser, otherwise nullpointer exception browser.navigate((String) args[0], null, null, null, null); // Some sleep time required. // If not: "Argument is not a COM object: null" at com.linar.jintegra.Dispatch. while (browser.getDocument() == null) { Thread.sleep(10); } HTMLDocument doc = new HTMLDocument(browser.getDocument()); System.out.println(getDocInfo(doc)); IHTMLElementCollection frames = doc.getElementsByTagName("FRAME"); for (int i = 0; i < frames.getLength(); i++) { Integer ii = new Integer(i); Object f = frames.item(ii, ii); IHTMLFrameBase2 fb = new IHTMLFrameBase2Proxy(f); HTMLDocument fdoc = new HTMLDocument(fb.getContentWindow().getDocument()); System.out.println("\nFrame content:\n=============\n"); System.out.println(getDocInfo(fdoc)); } //sleep for 5 seconds Thread.sleep(5000); System.exit(0); com.linar.jintegra.Cleaner.releaseAll(); } static public String getDocInfo(HTMLDocument doc) { try { StringBuffer info = new StringBuffer(); info.append("Number of elements = ") .append(doc.getAll().getLength()) .append("\n") .append("Number of tags = ") .append(doc.getElementsByTagName("BODY").getLength()) .append("\n") .append("Number of tags = ") .append(doc.getElementsByTagName("LINK").getLength()) .append("\n") .append("Number of tags = ") .append(doc.getElementsByTagName("HEAD").getLength()) .append("\n") .append("Number of tags = ") .append(doc.getElementsByTagName("HTML").getLength()) .append("\n") .append("Number of tags = ") .append(doc.getElementsByTagName("META").getLength()) .append("\n") .append("Number of
 tags = ")
                    .append(doc.getElementsByTagName("PRE").getLength())
                    .append("\n")
                    .append("Number of 

tags = ") .append(doc.getElementsByTagName("PRE").getLength()) .append("\n") .append("Number of

tags = ") .append(doc.getElementsByTagName("H1").getLength()) .append("\n") .append("Number of

tags = ") .append(doc.getElementsByTagName("H2").getLength()) .append("\n") .append("Number of

tags = ") .append(doc.getElementsByTagName("H3").getLength()) .append("\n") .append("Number of
tags = ") .append(doc.getElementsByTagName("DIV").getLength()) .append("\n") .append("Number of