1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 | import java.io.IOException; import java.net.URL; import org.htmlparser.Node; import org.htmlparser.Tag; import org.htmlparser.lexer.Lexer; import org.htmlparser.util.ParserException; /** * Parses the HTML code of the page specified by it's URL. * @author gabriel.solano * */ public class URLHTMLParser { /* * Tag handler that will be used to process the tags. * (This could be improved by implementing an observer * pattern to be able to add more than one TagHandler) */ private TagHandler tagHandler; /** * Constructor. * @param tagHandler */ public URLHTMLParser(TagHandler tagHandler) { this.tagHandler = tagHandler; } /** * Scans the specified URL. * @param url * @throws ParserException * @throws IOException */ public void scanURL(URL url) throws ParserException, IOException { Lexer lexer = new Lexer(url.openConnection()); extractHTMLNodes(lexer); } /** * Extracts the HTML nodes and lets the TagHandler to do something * with the tags. * @param lexer * @throws ParserException */ private void extractHTMLNodes(Lexer lexer) throws ParserException { Node node; while ( null != (node = lexer.nextNode( false ))) { if (node instanceof Tag) { Tag tag = (Tag) node; tagHandler.handleTag(tag); } } } } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | import org.htmlparser.Tag; /** * Defines the interface for a TagHandler. * @author gabriel.solano * */ public interface TagHandler { /** * Handles the process of an HTML tag. * @param tag */ public void handleTag(Tag tag); } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 | import java.util.HashSet; import java.util.Set; import org.htmlparser.Tag; /** * Handles the event when an anchor tag is found while parsing * HTML code of a page. * This class has a functionality to count all absolute URLs * found in the parsing process. * @author gabriel.solano * */ public class AnchorTagHandler implements TagHandler{ private Set<String> absoluteURLs; // All URLs found. /** * Constructor. */ public AnchorTagHandler() { absoluteURLs = new HashSet<String>(); } /** * Gets the found absolute URLs. * The collection is filled only during the scanning process * of an HTML page. * @return */ public Set<String> getAbsoluteURLs() { return absoluteURLs; } /** * Handles the tag only if it is an anchor tag. */ public void handleTag(Tag tag) { if (tag.getTagName().equalsIgnoreCase( "a" )) { // Process only if it's an anchor tag. processTag(tag); } } /** * Processes the anchor tag. In this case * adds all absolute URL's found. * @param tag */ private void processTag(Tag tag) { String href = tag.getAttribute( "href" ); if (href != null ) { href = href.toLowerCase(); // Add all URLs with HTTP protocol. absoluteURLs.add(href); } } } } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 | import java.net.URL; import java.util.Set; public class FindAbsoluteURLs { public static void main(String[] args) { AnchorTagHandler anchorTagHandler = new AnchorTagHandler(); URLHTMLParser htmlParser = new URLHTMLParser(anchorTagHandler); try { Set<String> urls = anchorTagHandler.getAbsoluteURLs(); for (String url : urls) { System.out.println(url); } } catch (Exception e) { e.printStackTrace(); } } } |
<java>
    <dependency>
        <groupId>org.htmlparser</groupId>
        <artifactId>htmlparser</artifactId>
        <version>1.6</version>
    </dependency>
</java>