import mshtml.*;
import shdocvw.*;
import javax.swing.*;
import java.util.*;
import java.io.IOException;
/*
* Copyright 2003 by Intrinsyc Software Inc.
* All rights reserved.
*/
public class HTMLDocumentParser{
/**
 * Entry point: shows an embedded WebBrowser control in a Swing frame,
 * navigates it to the URL given as the first argument, prints the summary
 * produced by {@link #getDocInfo(HTMLDocument)} for the loaded document and
 * for the document of every FRAME element found in it, waits five seconds,
 * releases the COM references and terminates the JVM.
 *
 * @param args command-line arguments; {@code args[0]} is the URL to load.
 *             With no arguments a usage message is printed and the method
 *             returns without creating the browser.
 * @throws Exception any exception raised by the browser/COM calls or by
 *                   {@code Thread.sleep} is propagated unchanged
 */
public static void main( String args[]) throws Exception
{
    if (args.length == 0)
    {
        System.out.println("\nThis program takes a url for an argument. For example:\n");
        System.out.println("java -DJINTEGRA_COINIT_VALUE=2 HTMLDocumentParser http://www.google.ca");
        return;
    }

    // NOTE(review): 'browser' is not declared in this method; presumably a
    // static field of this class declared elsewhere in the file.
    browser = new WebBrowser();
    browser.setSize(900,350);

    javax.swing.JFrame frame = new JFrame();
    frame.getContentPane().add(browser);
    frame.pack();
    // The frame must be visible before navigate() is invoked on the browser,
    // otherwise a NullPointerException is thrown.
    frame.setVisible(true);
    browser.navigate(args[0], null, null, null, null);

    // Poll until the document is available.
    // Without the wait: "Argument is not a COM object: null" at com.linar.jintegra.Dispatch.
    while (browser.getDocument() == null) {
        Thread.sleep(10);
    }

    HTMLDocument doc = new HTMLDocument(browser.getDocument());
    System.out.println(getDocInfo(doc));

    // Report on the document hosted by each FRAME element of the page.
    IHTMLElementCollection frames = doc.getElementsByTagName("FRAME");
    for (int i = 0; i < frames.getLength(); i++) {
        // The same boxed index is passed for both arguments of item().
        Integer ii = new Integer(i);
        Object f = frames.item(ii, ii);
        IHTMLFrameBase2 fb = new IHTMLFrameBase2Proxy(f);
        HTMLDocument fdoc = new HTMLDocument(fb.getContentWindow().getDocument());
        System.out.println("\nFrame content:\n=============\n");
        System.out.println(getDocInfo(fdoc));
    }

    //sleep for 5 seconds
    Thread.sleep(5000);

    // Release the COM references BEFORE exiting: System.exit() never returns,
    // so any statement placed after it would not be executed.
    com.linar.jintegra.Cleaner.releaseAll();
    System.exit(0);
}
static public String getDocInfo(HTMLDocument doc) {
try {
StringBuffer info = new StringBuffer();
info.append("Number of elements = ")
.append(doc.getAll().getLength())
.append("\n")
.append("Number of tags = ")
.append(doc.getElementsByTagName("BODY").getLength())
.append("\n")
.append("Number of tags = ")
.append(doc.getElementsByTagName("LINK").getLength())
.append("\n")
.append("Number of tags = ")
.append(doc.getElementsByTagName("HEAD").getLength())
.append("\n")
.append("Number of tags = ")
.append(doc.getElementsByTagName("HTML").getLength())
.append("\n")
.append("Number of tags = ")
.append(doc.getElementsByTagName("META").getLength())
.append("\n")
.append("Number of
tags = ")
.append(doc.getElementsByTagName("PRE").getLength())
.append("\n")
.append("Number of
tags = ")
.append(doc.getElementsByTagName("PRE").getLength())
.append("\n")
.append("Number of
tags = ")
.append(doc.getElementsByTagName("H1").getLength())
.append("\n")
.append("Number of
tags = ")
.append(doc.getElementsByTagName("H2").getLength())
.append("\n")
.append("Number of
tags = ")
.append(doc.getElementsByTagName("H3").getLength())
.append("\n")
.append("Number of
tags = ")
.append(doc.getElementsByTagName("DIV").getLength())
.append("\n")
.append("Number of