// 一个简单的网页抓取例子 - Java (a simple web page scraping example)

package net;  
  
  
import java.io.BufferedReader;  
import java.io.IOException;  
import java.io.InputStreamReader;  
import java.net.MalformedURLException;  
import java.net.URL;  
import java.util.ArrayList;  
import java.util.HashMap;  
import java.util.Iterator;  
import java.util.List;  
import java.util.regex.Matcher;  
import java.util.regex.Pattern;  
  
  
public class WebContent {  
    /**
     * Reads the entire content of a web page.
     *
     * <p>The response is decoded as UTF-8 and read line by line. Line
     * terminators are not kept, so the returned text is the concatenation of
     * all lines. The URL's protocol is printed to standard output before the
     * page is read.
     *
     * @param htmlurl the address of the page to read
     * @return the page content with line terminators removed
     * @throws MalformedURLException if {@code htmlurl} is not a well-formed URL
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(final String htmlurl) throws IOException {
        final StringBuilder htmlContent = new StringBuilder();
        BufferedReader in = null;
        try {
            final URL url = new URL(htmlurl);
            System.out.println(url.getProtocol());
            in = new BufferedReader(new InputStreamReader(url.openStream(), "utf-8"));
            String line;
            while ((line = in.readLine()) != null) {
                htmlContent.append(line);
            }
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题！请仔细输入");
            throw me;
        } catch (final IOException e) {
            e.printStackTrace();
            throw e;
        } finally {
            // Close on every path so a failed read does not leak the connection.
            if (in != null) {
                try {
                    in.close();
                } catch (final IOException ignored) {
                    // Nothing useful can be done here: either the content was
                    // already read completely or the original failure is
                    // already propagating to the caller.
                }
            }
        }
        return htmlContent.toString();
    }
  
  
    /** Matches a {@code <title>} element and captures its inner text in group 1. */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<title[^>]*>(.*?)</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Extracts the page title from HTML text.
     *
     * <p>The inner text of every {@code <title>} element is collected (in
     * document order, concatenated if there is more than one) and any markup
     * nested inside it is removed.
     *
     * @param s the HTML text to search
     * @return the title text without tags, or an empty string if the text
     *         contains no {@code <title>} element
     */
    public String getTitle(final String s) {
        final StringBuilder title = new StringBuilder();
        final Matcher ma = TITLE_PATTERN.matcher(s);
        while (ma.find()) {
            title.append(ma.group(1));
        }
        // Strip markup nested inside the title; done inline so the result does
        // not depend on any other method of this class.
        return title.toString().replaceAll("<[^>]*>", "");
    }
  
  
    /**
     * Matches a complete anchor element that carries an {@code href}
     * attribute; the attribute value may be double-quoted, single-quoted or
     * unquoted.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\b[^>]*href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Collects the hyperlinks found in HTML text.
     *
     * @param s the HTML text to search
     * @return every anchor element with an {@code href} attribute, each as the
     *         complete {@code <a ...>...</a>} source text, in document order;
     *         empty if there are none
     */
    public List<String> getLink(final String s) {
        final List<String> list = new ArrayList<String>();
        final Matcher ma = LINK_PATTERN.matcher(s);
        while (ma.find()) {
            list.add(ma.group());
        }
        return list;
    }
  
  
    /** Matches a complete {@code <script>} element, including its body. */
    private static final Pattern SCRIPT_PATTERN = Pattern.compile(
            "<script\\b[^>]*>.*?</script>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Collects the script blocks found in HTML text.
     *
     * @param s the HTML text to search
     * @return every {@code <script>...</script>} element as its complete
     *         source text, in document order; empty if there are none
     */
    public List<String> getScript(final String s) {
        final List<String> list = new ArrayList<String>();
        final Matcher ma = SCRIPT_PATTERN.matcher(s);
        while (ma.find()) {
            list.add(ma.group());
        }
        return list;
    }
  
  
    /** Matches a complete {@code <style>} element, including its body. */
    private static final Pattern STYLE_PATTERN = Pattern.compile(
            "<style\\b[^>]*>.*?</style>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Collects the embedded style sheets found in HTML text.
     *
     * @param s the HTML text to search
     * @return every {@code <style>...</style>} element as its complete source
     *         text, in document order; empty if there are none
     */
    public List<String> getCSS(final String s) {
        final List<String> list = new ArrayList<String>();
        final Matcher ma = STYLE_PATTERN.matcher(s);
        while (ma.find()) {
            list.add(ma.group());
        }
        return list;
    }
  
  
    /** Matches a single markup tag: {@code <}, anything except {@code >}, then {@code >}. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

    /**
     * Removes markup tags from text.
     *
     * <p>Each tag is matched on its own, so the text between two tags is kept.
     *
     * @param s the text to clean
     * @return {@code s} with every {@code <...>} tag removed
     */
    public String outTag(final String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }
  
  
    /** 
     *  
     * @param s 
     * @return 
     */  
    public HashMap> getFromUrls(final String url) {  
        final HashMap> result = new HashMap>();  
        String content = "";  
        System.out.println("\n------------------开始读取网页(" + url  
                + ")--------------------");  
        try {  
            content = getOneHtml(url);  
        } catch (final Exception e) {  
            e.getMessage();  
            return null;  
        }  
        System.out.println("------------------读取网页(" + url  
                + ")结束--------------------\n");  
        System.out.println("------------------分析网页(" + url  
                + ")结果如下--------------------\n");  
        List title = new ArrayList();  
        title.add(getTitle(content));  
        result.put("title", title);  
        result.put("css", getCSS(content));  
        result.put("script", getScript(content));  
        result.put("link", getLink(content));  
        return result;  
    }  
  
  
    /** 
     * @param args 
     */
一个简单的网页抓取例子(一)