抓取指定网页中的邮箱地址

2014-11-24 11:42:09 · 作者: · 浏览: 6

用了13天, 今天终于看完了毕向东的《Java基础入门(25天)》.

最后一个视频里的例子是用正则表达式抓邮箱. 看完之后, 稍微改了一下.

[java]
/*
Requirement:
Harvest the e-mail addresses found on the web pages whose URLs are listed in web.txt
and store them in mail.txt.

Approach:
1. Read the URLs from web.txt (one per line), open each page in turn and append its
   source, line by line, to source.txt.
2. Call getMails to scan source.txt for e-mail addresses; every match is written to mail.txt.
*/

import java.net.*;
import java.io.*;
import java.util.*;
import java.util.regex.*;

/**
 * Small command-line scraper that collects e-mail addresses from web pages.
 *
 * <p>All files are resolved against the current working directory:
 * <ul>
 *   <li>{@code web.txt} - input, one URL per line;</li>
 *   <li>{@code source.txt} - intermediate, the concatenated source of every page;</li>
 *   <li>{@code mail.txt} - output, one harvested address per line.</li>
 * </ul>
 * Text is decoded and encoded with the platform default charset throughout.
 */
public class Spider
{
    /**
     * Pragmatic e-mail pattern (not a full RFC 5322 parser): a local part made of word
     * characters optionally joined by '.', '+' or '-', followed by '@' and a domain of at
     * least two dot-separated labels. Labels may contain digits and hyphens so that
     * addresses such as user@163.com are recognised. Compiled once and reused.
     */
    private static final Pattern MAIL_PATTERN =
        Pattern.compile("\\w+([.+-]\\w+)*@[a-zA-Z0-9-]+(\\.[a-zA-Z0-9-]+)+");

    /**
     * Downloads the pages listed in web.txt and then extracts the addresses from them.
     *
     * @param args ignored
     * @throws Exception if any file or network operation fails
     */
    public static void main(String[] args) throws Exception
    {
        System.out.println("please wait..");
        getWebSource();
        System.out.println("get source success!");
        getMails();
        System.out.println("you have done.");
    }

    /**
     * Reads the URLs listed in web.txt and writes the source of every page to source.txt.
     * Each page line is also echoed to standard output. Blank lines in web.txt are skipped
     * and surrounding whitespace around a URL is ignored.
     *
     * @throws Exception if web.txt cannot be read, a line is not a valid URL, a page cannot
     *                   be fetched, or source.txt cannot be written
     */
    public static void getWebSource() throws Exception
    {
        // try-with-resources closes both files even when a download fails half-way.
        try (BufferedReader urlReader = new BufferedReader(new FileReader("web.txt"));
             BufferedWriter sourceWriter = new BufferedWriter(new FileWriter("source.txt")))
        {
            String address;
            while ((address = urlReader.readLine()) != null)
            {
                address = address.trim();
                if (address.isEmpty())
                {
                    // An empty line is not a URL; new URL("") would abort the whole run.
                    continue;
                }
                copyPage(new URL(address), sourceWriter);
            }
        }
    }

    /**
     * Copies the text behind {@code url} to {@code sink}, one line at a time.
     * Pages are handled one by one so that only a single connection is open at any time
     * and the last line of one page can never run into the first line of the next.
     */
    private static void copyPage(URL url, BufferedWriter sink) throws IOException
    {
        try (BufferedReader page = new BufferedReader(new InputStreamReader(url.openStream())))
        {
            String line;
            while ((line = page.readLine()) != null)
            {
                System.out.println(line);
                sink.write(line);
                // readLine() strips the terminator; restore it so that an address ending
                // one line is not fused with the text that starts the next line.
                sink.newLine();
            }
        }
    }

    /**
     * Scans source.txt line by line and writes every e-mail address found to mail.txt,
     * one address per line, in order of appearance (duplicates are kept).
     *
     * @throws Exception if source.txt cannot be read or mail.txt cannot be written
     */
    public static void getMails() throws Exception
    {
        try (BufferedReader sourceReader = new BufferedReader(new FileReader("source.txt"));
             PrintWriter mailWriter = new PrintWriter(new FileOutputStream("mail.txt"), true))
        {
            String line;
            while ((line = sourceReader.readLine()) != null)
            {
                Matcher matcher = MAIL_PATTERN.matcher(line);
                while (matcher.find())
                {
                    mailWriter.println(matcher.group());
                }
            }

            // PrintWriter never throws on write failures; surface them explicitly.
            if (mailWriter.checkError())
            {
                throw new IOException("failed to write mail.txt");
            }
        }
    }
}