package webcrawler; // Based on: http://cs.nyu.edu/courses/fall02/G22.3033-008/WebCrawler.java import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.util.Hashtable; import java.util.Vector; public class WebCrawler { public static final int MAX_PAGES = 2000; // Absolute max pages public static final int MAXSIZE = 2000000; // Max size of file public static final boolean DEBUG = false; // URLs to be searched Vector newURLs; // Known URLs Hashtable knownURLs; String startUrl; public static void main(String[] argv) throws Exception { WebCrawler wc = new WebCrawler(); String start = argv[0]; wc.crawl(start); wc.repeat(); } // initializes data structures. argv is the command line arguments. public void initialize(String start) { URL url; knownURLs = new Hashtable(); newURLs = new Vector(); try { startUrl = start; url = new URL(startUrl); } catch (MalformedURLException e) { System.out.println("Invalid starting URL " + startUrl); return; } knownURLs.put(url, new Integer(1)); newURLs.addElement(url); System.out.println("Starting search: Initial URL " + url.toString()); System.out.println("Maximum number of pages:" + MAX_PAGES); } // adds new URL to the queue. Accept only new URL's that end in // htm or html. oldURL is the context, newURLString is the link // (either an absolute or a relative URL). public void addnewurl(URL oldURL, String newUrlString) { URL url; if (oldURL.toString().matches(".*?/[a-z0-9_-]+")) { try { oldURL = new URL(oldURL.toString() + "/"); } catch (MalformedURLException e) { throw new RuntimeException(e); } } if (DEBUG) System.out.println("URL String " + newUrlString); try { url = new URL(oldURL, newUrlString); if (!knownURLs.containsKey(url) && url.toString().startsWith(startUrl)) { knownURLs.put(url, new Integer(1)); newURLs.addElement(url); System.out.println("Found new URL " + url.toString()); } } catch (MalformedURLException e) { return; } } // Download contents of URL public String getpage(URL url, boolean printMessages) { try { // try opening the URL URLConnection urlConnection = url.openConnection(); if (printMessages) { System.out.println("Downloading " + url.toString()); } if (url.toString().contains("/examples/async/")) { System.out.println("skip async url " + url.toString()); return ""; } urlConnection.setAllowUserInteraction(false); InputStream urlStream = url.openStream(); // search the input stream for links // first, read in the entire URL byte b[] = new byte[1000]; int numRead = urlStream.read(b); String content = new String(b, 0, numRead); while ((numRead != -1) && (content.length() < MAXSIZE)) { numRead = urlStream.read(b); if (numRead != -1) { String newContent = new String(b, 0, numRead); content += newContent; } } return content; } catch (IOException e) { // 401 or 404 are expected, just log them on console if (e.getMessage().contains("Server returned HTTP response code: 401 for URL") || e instanceof FileNotFoundException) { System.out.println("ERROR: couldn't open URL: '" + url.toString()); System.out.println(e.getMessage()); e.printStackTrace(); return ""; } else { // This is unexpected, throw exception throw new RuntimeException(e); } } } // Go through page finding links to URLs. A link is signalled // by ", index); ihref = lcPage.indexOf("href", index); if (ihref != -1) { iURL = lcPage.indexOf("\"", ihref) + 1; if ((iURL != -1) && (iEndAngle != -1) && (iURL < iEndAngle)) { iCloseQuote = lcPage.indexOf("\"", iURL); iHatchMark = lcPage.indexOf("#", iURL); if ((iCloseQuote != -1) && (iCloseQuote < iEndAngle)) { iEnd = iCloseQuote; if ((iHatchMark != -1) && (iHatchMark < iCloseQuote)) iEnd = iHatchMark; String newUrlString = page.substring(iURL, iEnd); addnewurl(url, newUrlString); } } } index = iEndAngle; } } // Top-level procedure. Keep popping a url off newURLs, download // it, and accumulate new URLs public void crawl(String startUrl) { initialize(startUrl); for (int i = 0; i < MAX_PAGES; i++) { URL url = (URL) newURLs.elementAt(0); newURLs.removeElementAt(0); if (DEBUG) { System.out.println("Searching " + url.toString()); } String page = getpage(url, true); if (DEBUG) { System.out.println(page); } if (page.length() != 0) { processpage(url, page); } if (newURLs.isEmpty()) { break; } } System.out.println("Crawl complete, total " + knownURLs.size() + " pages."); } public void repeat() { for (int i = 1; i <= 500; i++) { System.out.printf("Round: %d / 500\r\n", i); for (URL url : knownURLs.keySet()) { String urls = url.toString(); if (!urls.endsWith("/manager/status") && !urls.endsWith("/manager/html") && !urls.endsWith("/host-manager/html") && !urls.endsWith("/sample") && !urls.endsWith("/coyote.html") && !urls.endsWith("/comments.html") && !urls.endsWith("/servletToJsp.java.html") && !urls.contains("/docs/api/org/apache/") && !urls.contains("/examples/async/")) { getpage(url, false); } } } } }