Happy New Year!

The word "literature" has different meanings depending on who is using it. It could be applied broadly to mean any symbolic record, encompassing everything from images and sculptures to letters.
package com.zapcaster.crawl;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Command-line driver for {@link LinkSpider}.
 *
 * <p>Takes a single URL argument, crawls from it with a {@code LinkSpider},
 * reports the elapsed wall-clock time in milliseconds, and then prints the raw
 * content fetched for that same URL.
 */
public class GetContent {

    /**
     * Program entry point.
     *
     * <p>Exits with status {@code -1} when the argument count is not exactly one
     * or when the argument cannot be turned into a spider because the URL is
     * malformed.
     *
     * @param args command-line arguments; exactly one element, the URL to crawl
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            // Tell the user what argument is expected, not just the class name.
            System.out.println("Usage: java GetContent <url>");
            System.exit(-1);
        }

        LinkSpider spider = null;
        try {
            spider = new LinkSpider(args[0]);
        } catch (MalformedURLException e) {
            System.out.println(e);
            System.out.println("Invalid URL: " + args[0]);
            System.exit(-1);
        }

        System.out.println("Get Content:");
        long start = System.currentTimeMillis(); // performance tracking
        spider.traverse();
        System.out.println("Time elapsed = " + (System.currentTimeMillis() - start));
        System.out.println("Finished");

        try {
            byte[] buff = spider.getContent(new URL(args[0]));
            StringBuilder sb = new StringBuilder(buff.length);
            for (byte aBuff : buff) {
                // Mask to 0..255 first: a plain (char) cast sign-extends bytes
                // >= 0x80 into U+FF80..U+FFFF instead of the intended code unit.
                sb.append((char) (aBuff & 0xFF));
            }
            System.out.println("[[--[" + sb + "]--]]");
        } catch (Exception e) {
            // Report the cause instead of hiding it behind a fixed message.
            System.out.println("URL error: " + e);
        }
    }
}
package com.zapcaster.crawl;
import bplatt.spider.Arachnid;
import bplatt.spider.PageInfo;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
public class LinkSpider extends Arachnid {
private HashSet images;
private HashSetlinks;
public LinkSpider(String base) throws MalformedURLException
{
super(base);
super.setDelay(5);
links = new HashSet();
// no need to file out this.outdir = outdir;
}
protected void handleBadLink(URL url,URL parent, PageInfo p) { }
protected void handleBadIO(URL url, URL parent) { }
protected void handleLink(PageInfo p) {
URL[] list = p.getLinks();
int x=0;
if (list != null) {
for (URL aList : list)
if (!links.contains(aList)) {
links.add(aList);
System.out.println("#" + (++x) +"Link SAVED : " + aList.toString());
}
}
}
protected void handleNonHTMLlink(URL url, URL parent,PageInfo p) { }
protected void handleExternalLink(URL url, URL parent) { }
private void printURLs()
{
int x=0;
for (URL link : links) {
System.out.println("Link [" + (x++) + "] " + link.toString());
}
}
}
tags: spindle jobo websphinx larm metis hyperspider open source grunk arachnid web crawler spider linkspider weblech heritrix simplespider comzapcastercrawl jspider
links: digg this del.icio.us technorati reddit
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Next |