用了13天, 今天终于看完了毕向东的《Java基础入门(25天)》.
最后一个视频里的例子是用正则表达式抓邮箱. 看完之后, 稍微改了一下.
[java]
/*
需求:
从web.txt文件中存的网址里抓取邮箱地址, 存在mail.txt文件中.
思路:
1, 从web.txt中读取网址, 建立连接, 用SequenceInputStream将所有网页源文件存在source.txt中.
2, 调用getMails方法抓邮箱, 抓到的邮箱存在mail.txt中.
*/
import java.net.*;
import java.io.*;
import java.util.*;
import java.util.regex.*;
/**
 * Harvests e-mail addresses from a list of web pages.
 *
 * <p>The addresses of the pages are read from {@code web.txt} (one URL per
 * line), the concatenated page sources are stored in {@code source.txt}, and
 * every e-mail address found in that file is written to {@code mail.txt}.
 * All three files are resolved against the current working directory.
 */
public class Spider
{
    /**
     * Simple e-mail shape: one or more word characters, {@code @}, then two or
     * more letter-only labels separated by dots. Compiled once and reused.
     */
    private static final Pattern MAIL_PATTERN =
        Pattern.compile("\\w+@[a-zA-Z]+(\\.[a-zA-Z]+)+");

    /**
     * Runs the two phases in order: download the page sources, then extract
     * the e-mail addresses, printing a progress message around each phase.
     *
     * @param args ignored
     * @throws Exception if either phase fails
     */
    public static void main(String[] args) throws Exception
    {
        System.out.println("please wait..");
        getWebSource();
        System.out.println("get source success!");
        getMails();
        System.out.println("you have done.");
    }

    /**
     * Opens every URL listed in {@code web.txt}, joins the page streams with a
     * {@link SequenceInputStream} and copies the result line by line into
     * {@code source.txt}, echoing each line to standard output.
     *
     * <p>Text is decoded and encoded with the platform default charset, as
     * {@link InputStreamReader} and {@link FileWriter} are used without an
     * explicit charset.
     *
     * @throws Exception if {@code web.txt} cannot be read, a line is not a
     *         valid URL, a connection cannot be opened, or writing fails
     */
    public static void getWebSource() throws Exception
    {
        Vector<InputStream> v = openPages();
        Enumeration<InputStream> en = v.elements();
        SequenceInputStream sis = new SequenceInputStream(en);
        try
        {
            BufferedReader bufIn =
                new BufferedReader(new InputStreamReader(sis));
            BufferedWriter bufOut =
                new BufferedWriter(new FileWriter("source.txt"));
            try
            {
                String lineIn = null;
                while ((lineIn = bufIn.readLine()) != null)
                {
                    System.out.println(lineIn);
                    bufOut.write(lineIn);
                    // readLine() strips the terminator; without restoring it all
                    // lines run together and an address at the end of one line
                    // can fuse with letters at the start of the next.
                    bufOut.newLine();
                }
            }
            finally
            {
                bufOut.close();
            }
        }
        finally
        {
            // Closing the sequence also closes every page stream that has not
            // been consumed yet.
            sis.close();
        }
    }

    /**
     * Scans {@code source.txt} line by line and writes every match of
     * {@link #MAIL_PATTERN} to {@code mail.txt}, one address per line.
     * Duplicates are not removed.
     *
     * @throws Exception if {@code source.txt} cannot be read or
     *         {@code mail.txt} cannot be written
     */
    public static void getMails() throws Exception
    {
        BufferedReader bufIn =
            new BufferedReader(new FileReader("source.txt"));
        try
        {
            PrintWriter out =
                new PrintWriter(new FileOutputStream("mail.txt"), true);
            try
            {
                String line = null;
                while ((line = bufIn.readLine()) != null)
                {
                    Matcher m = MAIL_PATTERN.matcher(line);
                    while (m.find())
                    {
                        out.println(m.group());
                    }
                }
                // PrintWriter never throws on write failures; surface them here.
                if (out.checkError())
                {
                    throw new IOException("failed to write mail.txt");
                }
            }
            finally
            {
                out.close();
            }
        }
        finally
        {
            bufIn.close();
        }
    }

    /**
     * Reads {@code web.txt} and opens a stream for each non-blank line,
     * treating the trimmed line as a URL. If any step fails, the streams
     * opened so far are closed before the exception propagates.
     */
    private static Vector<InputStream> openPages() throws IOException
    {
        Vector<InputStream> v = new Vector<InputStream>();
        BufferedReader bufr =
            new BufferedReader(new FileReader("web.txt"));
        boolean success = false;
        try
        {
            String line = null;
            while ((line = bufr.readLine()) != null)
            {
                String address = line.trim();
                if (address.length() == 0)
                {
                    // A blank line is not a URL; skip it instead of failing.
                    continue;
                }
                URL url = new URL(address);
                v.add(url.openStream());
            }
            success = true;
            return v;
        }
        finally
        {
            if (!success)
            {
                for (InputStream in : v)
                {
                    closeQuietly(in);
                }
            }
            bufr.close();
        }
    }

    /** Closes a stream during error cleanup without masking the original failure. */
    private static void closeQuietly(Closeable c)
    {
        try
        {
            c.close();
        }
        catch (IOException ignored)
        {
            // Already unwinding because of an earlier exception; that one is
            // the error worth reporting.
        }
    }
}