一个简单的网页抓取例子(一)

2014-11-24 09:24:19 · 作者: · 浏览: 5
package net;  
  
  
import java.io.BufferedReader;  
import java.io.IOException;  
import java.io.InputStreamReader;  
import java.net.MalformedURLException;  
import java.net.URL;  
import java.util.ArrayList;  
import java.util.HashMap;  
import java.util.Iterator;  
import java.util.List;  
import java.util.regex.Matcher;  
import java.util.regex.Pattern;  
  
  
public class WebContent {  
    /**
     * Reads the entire content of a web page.
     *
     * <p>The page is decoded as UTF-8 and read line by line. Line terminators
     * are dropped, so the returned text is a single line. The protocol of the
     * URL is printed to standard output before the page is read.
     *
     * @param htmlurl address of the page to read
     * @return the page content with line terminators removed
     * @throws IOException if the URL is malformed (reported as a
     *         {@link MalformedURLException}) or the page cannot be read
     */
    public String getOneHtml(final String htmlurl) throws IOException {
        final StringBuilder htmlContent = new StringBuilder();
        try {
            final URL url = new URL(htmlurl);
            System.out.println(url.getProtocol());
            final BufferedReader in = new BufferedReader(new InputStreamReader(
                    url.openStream(), "utf-8"));
            try {
                String line;
                while ((line = in.readLine()) != null) {
                    htmlContent.append(line);
                }
            } finally {
                // Close even when readLine fails so the underlying stream is not leaked.
                in.close();
            }
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            throw me;
        } catch (final IOException e) {
            e.printStackTrace();
            throw e;
        }
        return htmlContent.toString();
    }
  
  
    /**
     * Extracts the page title from HTML text.
     *
     * <p>The text between every {@code <title>} start tag and its
     * {@code </title>} end tag is collected (tag names are matched
     * case-insensitively), the pieces are concatenated in document order, and
     * the result is passed through {@link #outTag(String)} to drop any markup
     * nested inside the title.
     *
     * @param s HTML text to search
     * @return the title text, or an empty string when no title element is found
     */
    public String getTitle(final String s) {
        // Capture only the text between the tags, so the tags themselves never
        // reach the result regardless of how outTag treats them.
        final Pattern pa = Pattern.compile("<title\\b[^>]*>(.*?)</title>",
                Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        final Matcher ma = pa.matcher(s);
        final StringBuilder title = new StringBuilder();
        while (ma.find()) {
            title.append(ma.group(1));
        }
        return outTag(title.toString());
    }
  
  
    /**
     * Extracts every anchor element that carries an {@code href} attribute.
     *
     * <p>Each returned entry is the complete element text, from the opening
     * {@code <a ...>} tag through the matching {@code </a>}. The attribute
     * value may be double-quoted, single-quoted or unquoted. Tag names are
     * matched case-insensitively and the link text may span several lines.
     *
     * @param s HTML text to search
     * @return the anchors in document order; empty when there are none
     */
    public List<String> getLink(final String s) {
        final String regex =
                "<a\\b[^>]*href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]*))[^>]*>(.*?)</a>";
        final Pattern pa = Pattern.compile(regex,
                Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        final Matcher ma = pa.matcher(s);
        final List<String> list = new ArrayList<String>();
        while (ma.find()) {
            list.add(ma.group());
        }
        return list;
    }
  
  
    /** 
     *  
     * @param s 
     * @return 获得脚本代码 
     */  
    public List
getScript(final String s) { String regex; final List list = new ArrayList(); regex = ""; final Pattern pa = Pattern.compile(regex, Pattern.DOTALL); final Matcher ma = pa.matcher(s); while (ma.find()) { list.add(ma.group()); } return list; } /** * * @param s * @return 获得CSS */ public List getCSS(final String s) { String regex; final List list = new ArrayList(); regex = ""; final Pattern pa = Pattern.compile(regex, Pattern.DOTALL); final Matcher ma = pa.matcher(s); while (ma.find()) { list.add(ma.group()); } return list; } /** * * @param s * @return 去掉标记 */ public String outTag(final String s) { return s.replaceAll("<.* >", ""); } /** * * @param s * @return */ public HashMap> getFromUrls(final String url) { final HashMap> result = new HashMap>(); String content = ""; System.out.println("\n------------------开始读取网页(" + url + ")--------------------"); try { content = getOneHtml(url); } catch (final Exception e) { e.getMessage(); return null; } System.out.println("------------------读取网页(" + url + ")结束--------------------\n"); System.out.println("------------------分析网页(" + url + ")结果如下--------------------\n"); List title = new ArrayList(); title.add(getTitle(content)); result.put("title", title); result.put("css", getCSS(content)); result.put("script", getScript(content)); result.put("link", getLink(content)); return result; } /** * @param args */