Seaflower - The world's first DOM crawler for vertical search

Want to crawl body text in the following HTML page?

<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Hello world</title>
</head>
<body>
<script>document.write('Hello,world!');</script>
</body>
</html>

Unfortunately, conventional web crawlers can do nothing with this HTML page. They cannot execute JavaScript, so you will get no results.

But Seaflower can get it.

What is Seaflower?

Seaflower is the world's first DOM crawler for vertical search. It is based on the Firefox browser and runs on Linux systems. It can crawl the dynamic contents of web pages that are generated by JavaScript, and it can output the DOM data as XML, from which you can extract specific data with XPath.

Conventional web crawler has some disadvantages, such as:

1. Dynamic contents generated by javascript cannot be crawled

2. Data in malformed web pages is difficult to extract

3. They crawl web pages by emulating a browser instead of using a real browser

Instead, Seaflower has these advantages as follows:

1. It's based on Firefox browser

2. Data using XML format

This XML data can be transformed into a DOM (Document Object Model), so you can use XPath to extract contents. To get the title of a web page, use /html/head/title. To get all links, use //a/@href, and so on.

3. Web page data is complete and fresh

The web page data returned by Seaflower contains the dynamic data generated by JavaScript.

Seaflower can output HTTP request/response headers also.

4. Multi-OS support, multi-threaded, runs in the background

5. Simple crawl protocol

An HTTP-like crawl protocol. You can get XML results with a simple GET command.

6. Turning pages via JavaScript is supported

Seaflower provides an EXEC command to execute JavaScript on a specific URL. Combined with the GET/CONTINUE/NODATA commands, it lets you fetch web page contents continually. Seaflower also provides a getNodeByXPath method for JavaScript; to emulate a click on the first input button, just send: EXEC getNodeByXPath('//input[1]').onclick()

7. Customize HTTP request headers

For example, send "HTTP-HEADER user-agent: xxx", the web server will think the client program is xxx.

8. Can disable javascript running in page

Send the "DISABLE javascript" command to Seaflower, and the JavaScript in the web page will not run.

Note: Seaflower isn't a spider; it's a crawling tool. Try seaspider - the cutting-edge spider system for vertical search.

Seaflower.java Download

package com.zhsoft88.commons;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.net.Socket;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.math.NumberUtils;

/**
 * Seaflower crawler client: sends a crawl request to a Seaflower server over a socket and parses the reply.
 * @author zhsoft88
 * @since 2008-4-13
 * @update 2009-6-20
 */
public class Seaflower {

	public static final int PORT = 4050;
	
	/**
	 * crawl result
	 * @author zhsoft88
	 * @since 2008-4-13
	 */
	public static class SeaflowerResult {
		private int status;
		private String title;
		private String location;
		private String contents;
		private long time;
		private List<String> requestHeaders;
		private List<String> responseHeaders;
		
		public SeaflowerResult() {
		}
		
		public int getStatus() {
			return status;
		}
		public String getTitle() {
			return title;
		}
		public String getLocation() {
			return location;
		}
		public String getContents() {
			return contents;
		}
		public long getTime() {
			return time;
		}
		public List<String> getRequestHeaders() {
			return requestHeaders;
		}
		public List<String> getResponseHeaders() {
			return responseHeaders;
		}
		protected void setStatus(int status) {
			this.status = status;
		}
		protected void setTitle(String title) {
			this.title = title;
		}
		protected void setLocation(String location) {
			this.location = location;
		}
		protected void setContents(String contents) {
			this.contents = contents;
		}
		protected void setTime(long time) {
			this.time = time;
		}
		protected void setRequestHeaders(List<String> requestHeaders) {
			this.requestHeaders = requestHeaders;
		}
		protected void setResponseHeaders(List<String> responseHeaders) {
			this.responseHeaders = responseHeaders;
		}
		@Override
		public String toString() {
			return "status="+status+",location="+location+",title="+title+",time="+time+",request-headers="+requestHeaders+",response-headers="+responseHeaders+",contents=["+contents+"]";
		}
	}
	
	/**
	 * crawl configuration
	 * @author zhsoft88
	 * @since 2008-4-13
	 * @update 2008-12-16
	 */
	public static class SeaflowerConf {
		private String url;
		private String exec;
		private int waitTime;
		private boolean cont;
		private boolean nodata;
		private List<String> httpHeaders;
		private boolean outputHttpHeaders;
		private boolean disableJavascript;
		
		public SeaflowerConf() {
		}

		public void addHttpHeader(String header) {
			if (httpHeaders==null) {
				httpHeaders = new ArrayList<String>();
			}
			httpHeaders.add(header);
		}
		
		public String getUrl() {
			return url;
		}

		public void setUrl(String url) {
			this.url = url;
		}

		public String getExec() {
			return exec;
		}

		public void setExec(String exec) {
			this.exec = exec;
		}

		public int getWaitTime() {
			return waitTime;
		}

		public void setWaitTime(int waitTime) {
			this.waitTime = waitTime;
		}

		public void setContinue(boolean cont) {
			this.cont = cont;
		}

		public boolean isNodata() {
			return nodata;
		}

		public void setNodata(boolean nodata) {
			this.nodata = nodata;
		}

		public boolean isContinue() {
			return cont;
		}

		public List<String> getHttpHeaders() {
			return httpHeaders;
		}

		public void setHttpHeaders(List<String> httpHeaders) {
			this.httpHeaders = httpHeaders;
		}

		public boolean isOutputHttpHeaders() {
			return outputHttpHeaders;
		}

		public void setOutputHttpHeaders(boolean outputHttpHeaders) {
			this.outputHttpHeaders = outputHttpHeaders;
		}

		public boolean isDisableJavascript() {
			return disableJavascript;
		}

		public void setDisableJavascript(boolean disableJavascript) {
			this.disableJavascript = disableJavascript;
		}
		
	}

	private Socket socket;
	
	public Seaflower() throws UnknownHostException, IOException {
		this("localhost");
	}
	
	public Seaflower(String host) throws UnknownHostException, IOException {
		this(host,PORT);
	}
	
	public Seaflower(String host,int port) throws UnknownHostException, IOException {
		socket = new Socket(host,port);
	}
	
	private String readUTFLine(InputStream in) throws IOException {
		ByteArrayOutputStream baos = new ByteArrayOutputStream();
		int c;
		while ((c=in.read())!=-1) {
			if (c=='\r') continue;
			if (c=='\n') break;
			baos.write(c);
		}
		return baos.toString("utf-8");
	}
	/**
	 * crawl
	 * @param conf
	 * @return
	 * @throws Exception
	 */
	public SeaflowerResult crawl(SeaflowerConf conf) throws Exception {
		if (socket==null) {
			throw new Exception("socket closed");
		}
		long t1 = System.currentTimeMillis();
		BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(socket.getOutputStream()));
		if (conf.getUrl()!=null) {
			bw.write("GET "+conf.getUrl()+"\r\n");
		}
		if (conf.getExec()!=null) {
			bw.write("EXEC "+conf.getExec()+"\r\n");
		}
		if (conf.getWaitTime()!=-1) {
			bw.write("WAIT-TIME "+conf.getWaitTime()+"\r\n");
		}
		if (conf.getHttpHeaders()!=null) {
			for (String s : conf.getHttpHeaders()) {
				bw.write("HTTP-HEADER "+s+"\r\n");
			}
		}
		if (conf.isOutputHttpHeaders()) {
			bw.write("OUTPUT http-headers\r\n");
		}
		if (conf.isDisableJavascript()) {
			bw.write("DISABLE javascript\r\n");
		}
		if (conf.isContinue()) {
			bw.write("CONTINUE\r\n");
		}
		if (conf.isNodata()) {
			bw.write("NODATA\r\n");
		}
		bw.write("\r\n");
		bw.flush();
		InputStream in = socket.getInputStream();
		String line = readUTFLine(in);
		int status = -1;
		StringTokenizer st = new StringTokenizer(line," ");
		st.nextToken();
		status = NumberUtils.toInt(st.nextToken());
		String tagTitle = "Current-Title: ";
		String tagLocation = "Current-Location: ";
		String tagLength = "Content-Length: ";
		String title = null;
		String location = null;
		int length = 0;
		while ((line=readUTFLine(in))!=null) {
			if (line.length()==0) break;
			if (line.startsWith(tagTitle)) {
				title = line.substring(tagTitle.length());
			} else if (line.startsWith(tagLocation)) {
				location = line.substring(tagLocation.length());
			} else if (line.startsWith(tagLength)) {
				length = NumberUtils.toInt(line.substring(tagLength.length()));
			}
		}
		ByteArrayOutputStream baos = new ByteArrayOutputStream(length);
		byte[] ba = new byte[4096];
		while (length>0) {
			int len = in.read(ba);
			baos.write(ba, 0, len);
			length -= len;
		}
		String contents = baos.toString("utf-8");
		if (!conf.isContinue()) {
			socket.close();
			socket = null;
		}
		long t2 = System.currentTimeMillis();
		SeaflowerResult result = new SeaflowerResult();
		result.setStatus(status);
		result.setTitle(title);
		result.setLocation(location);
		result.setTime(t2-t1);
		result.setContents(contents);
		List<String> requestHeaders = new ArrayList<String>();
		List<String> responseHeaders = new ArrayList<String>();
		if (conf.isOutputHttpHeaders()) {
			BufferedReader hbr = new BufferedReader(new StringReader(result.getContents()));
			boolean found = false;
			while ((line=hbr.readLine())!=null) {
				if (line.equals("<!-- @REQUEST-HEADERS [")) {
					found = true;
					break;
				}
			}
			if (found)
			{
				while ((line=hbr.readLine())!=null) {
					if (line.equals("] -->")) break;
					requestHeaders.add(line);
				}
			}
			found = false;
			while ((line=hbr.readLine())!=null) {
				if (line.equals("<!-- @RESPONSE-HEADERS [")) {
					found = true;
					break;
				}
			}
			if (found)
			{
				while ((line=hbr.readLine())!=null) {
					if (line.equals("] -->")) break;
					responseHeaders.add(line);
				}
			}
			result.setContents(IOUtils.toString(hbr));
		}
		result.setRequestHeaders(requestHeaders);
		result.setResponseHeaders(responseHeaders);
		return result;
	}
	
}

TestSeaflower.java Download

package com.zhsoft88.commons.tests;

import com.zhsoft88.commons.Seaflower;
import com.zhsoft88.commons.Seaflower.SeaflowerConf;
import com.zhsoft88.commons.Seaflower.SeaflowerResult;

public class TestSeaflower {

	/**
	 * Crawls http://www.google.com with two custom HTTP request headers and
	 * header output enabled, then prints the request headers, the response
	 * headers and the page contents, one per line.
	 * @param args not used
	 */
	public static void main(String[] args) throws Exception {
		SeaflowerConf request = new SeaflowerConf();
		request.setOutputHttpHeaders(true);
		request.setUrl("http://www.google.com");
		// header order is kept: it is the order in which they are sent
		request.addHttpHeader("user-agent: jiong/0.1");
		request.addHttpHeader("kkkkkkk: vvvvvvvvvvvvvvv");

		SeaflowerResult page = new Seaflower().crawl(request);

		Object[] report = {
			page.getRequestHeaders(),
			page.getResponseHeaders(),
			page.getContents(),
		};
		for (Object part : report) {
			System.out.println(part);
		}
	}

}

TestSeaflower4.java Download

Crawl www.ourku.com, extract specific data

package com.zhsoft88.commons.tests;

import java.util.List;

import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;

import com.zhsoft88.commons.Seaflower;
import com.zhsoft88.commons.Seaflower.SeaflowerConf;
import com.zhsoft88.commons.Seaflower.SeaflowerResult;

/**
 * Test of Seaflower: crawl data from the Ku fund website (www.ourku.com)
 * @author zhsoft88
 * @since 2009-6-20
 */
public class TestSeaflower4 {

	/** Number of leading td cells printed for every matched table row. */
	private static final int COLUMNS = 8;

	/**
	 * Crawls http://www.ourku.com/index_kfjj.html with a wait time of 10,
	 * parses the returned contents as XML, and prints the first eight cells of
	 * every row matched by //table[@id='ilist']//tr[@class='tr'] as one
	 * comma-separated line, preceded by the number of matched rows.
	 * @param args not used
	 */
	public static void main(String[] args) throws Exception {
		Seaflower s = new Seaflower();
		SeaflowerConf conf = new SeaflowerConf();
		conf.setUrl("http://www.ourku.com/index_kfjj.html");
		conf.setWaitTime(10);
		SeaflowerResult result = s.crawl(conf);
		Document doc = DocumentHelper.parseText(result.getContents());
		List<Node> list = doc.selectNodes("//table[@id='ilist']//tr[@class='tr']");
		System.out.println("total size="+list.size());
		for (Node no : list) {
			StringBuilder row = new StringBuilder();
			for (int i=1;i<=COLUMNS;i++) {
				Node cell = no.selectSingleNode("td["+i+"]");
				// a row with fewer cells gives no node here; print an empty field instead of failing on null
				row.append(cell==null ? "" : cell.getStringValue());
				if (i!=COLUMNS) row.append(", ");
			}
			System.out.println(row);
		}
	}

}

TestSeaflower3Cont.java Download

Click emulation, extract image source link.

package test;

import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;

import com.zhsoft88.commons.Seaflower;
import com.zhsoft88.commons.Seaflower.SeaflowerConf;
import com.zhsoft88.commons.Seaflower.SeaflowerResult;

public class TestSeaflower3Cont {

	/**
	 * Emulates clicks on a product page and prints the image source after each one.
	 * The page is first loaded with CONTINUE and NODATA set, which keeps the
	 * connection open. Then, for swatch ids 0 to 4, the onclick handler of element
	 * colorSwatch_N is executed on the same connection, and the src attribute of
	 * the img with id 'dragImg' is printed from the returned contents.
	 * The last request clears CONTINUE, so the client closes the socket after it.
	 * @param args not used
	 */
	public static void main(String[] args) throws Exception {
		Seaflower sf = new Seaflower();
		{
			SeaflowerConf conf = new SeaflowerConf();
			conf.setContinue(true);
			conf.setNodata(true);
			conf.setWaitTime(10);
			conf.setUrl("http://www.gap.com/browse/product.do?cid=8793&vid=1&pid=655053");
			sf.crawl(conf);
		}
		for (char c='0';c<='4'; c++)
		{
			SeaflowerConf conf = new SeaflowerConf();
			// keep the connection open for every swatch except the last one
			conf.setContinue(c!='4');
			conf.setExec("document.getElementById('colorSwatch_"+c+"').onclick()");
			SeaflowerResult result = sf.crawl(conf);
			Document doc = DocumentHelper.parseText(result.getContents());
			Attribute a = (Attribute)doc.selectSingleNode("//img[@id='dragImg']/@src");
			if (a==null) {
				// the page may not contain the image (layout change, failed click); report it and go on
				System.err.println("no dragImg src found for colorSwatch_"+c);
				continue;
			}
			System.out.println(a.getValue());
		}
	}

}

About Seaflower

Download

Install

Seaflower server management

Tool of configuration management - seaflowerctl

Register

Seaflower Protocol

Example codes

Seaflower.java Download

TestSeaflower.java Download

TestSeaflower4.java Download

TestSeaflower3Cont.java Download

Crawl data in next web page