设为首页 加入收藏

TOP

Linux下Java语言实现简陋Web爬虫
2014-11-24 02:26:15 来源: 作者: 【 】 浏览:2
Tags:Linux Java 语言 实现 简陋 Web 爬虫

Linux环境下Java语言实现简陋Web爬虫:


import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.Socket;
import java.net.UnknownHostException;
import java.nio.charset.StandardCharsets;


/**
 * A deliberately minimal web "crawler": opens a plain TCP connection to one
 * fixed host, issues a single {@code GET /} request, drops everything that
 * precedes the first {@code '<'} character of the response (in practice the
 * status line and headers), and stores the remainder in a local file while
 * also echoing it to standard output.
 */
public class WebCrawler {

    /**
     * File that receives the downloaded page. Its parent directory must exist
     * before the program runs; it is not created here.
     */
    private static final String Text_File_Path = "/home/zms/htmldoc/htmldoc1.html";

    /** Host that is contacted and also announced in the {@code Host} header. */
    private static final String HOST = "ubuntuone.cn";

    /** TCP port of the plain-HTTP service. */
    private static final int PORT = 80;

    /**
     * Downloads the root page of {@link #HOST}, prints it to the console and
     * writes it to {@link #Text_File_Path}. On any I/O failure an error message
     * and stack trace are printed and the JVM exits with status 1.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        File file = new File(Text_File_Path);

        // try-with-resources closes the file, the socket and the reader on every
        // path, including failures, before the catch blocks call System.exit.
        // assumes the page is UTF-8 encoded — TODO confirm; the charset declared
        // by the response is not inspected.
        try (OutputStreamWriter pageWriter =
                     new OutputStreamWriter(new FileOutputStream(file), StandardCharsets.UTF_8);
             Socket webclient = new Socket(HOST, PORT);
             BufferedReader receiver = new BufferedReader(
                     new InputStreamReader(webclient.getInputStream(), StandardCharsets.UTF_8))) {

            // Send the HTTP request.
            sendRequest(webclient, HOST);

            // Receive the HTTP response, keeping only the part from the first '<'.
            String page = readFromFirstTag(receiver);

            // Show the page text on the console and persist it to the file.
            System.out.println(page);
            pageWriter.write(page);

        } catch (UnknownHostException e) {
            System.err.println("无法访问您指定的主机。");
            e.printStackTrace();
            System.exit(1);
        } catch (IOException e) {
            System.err.println("下载失败,请检查输入地址是否正确。");
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Writes a {@code GET /} request to the socket.
     *
     * <p>Header lines are terminated with an explicit CRLF, as HTTP requires,
     * instead of the platform line separator, and the {@code Host} header names
     * the host that is actually being contacted.
     *
     * <p>NOTE(review): an HTTP/1.1 server may reply with chunked transfer
     * encoding; chunk-size lines that appear after the first {@code '<'} would
     * then remain in the captured text. That case is not handled here.
     *
     * @param socket connected socket to write the request to
     * @param host   value for the {@code Host} request header
     * @throws IOException if writing to the socket fails
     */
    private static void sendRequest(Socket socket, String host) throws IOException {
        String request = "GET / HTTP/1.1\r\n"
                + "Host: " + host + "\r\n"
                + "Connection: Close\r\n"
                + "\r\n";
        socket.getOutputStream().write(request.getBytes(StandardCharsets.US_ASCII));
        socket.getOutputStream().flush();
    }

    /**
     * Reads the stream to its end and returns everything starting at the first
     * {@code '<'} character. The blocking {@code read()} waits for data itself,
     * so no polling of {@code ready()} is needed.
     *
     * @param in reader positioned at the start of the HTTP response
     * @return the text from the first {@code '<'} through end of stream, or an
     *         empty string if the stream contains no {@code '<'}
     * @throws IOException if reading fails
     */
    private static String readFromFirstTag(BufferedReader in) throws IOException {
        int ch = in.read();
        // Skip everything before the first '<'.
        while (ch != -1 && ch != '<') {
            ch = in.read();
        }

        StringBuilder page = new StringBuilder(8096);
        while (ch != -1) {
            page.append((char) ch);
            ch = in.read();
        }
        return page.toString();
    }
}


【打印繁体】【投稿】【收藏】【推荐】【举报】【评论】【关闭】【返回顶部】
分享到: 
上一篇Linux 编译安装Wireshark出现的问.. 下一篇在Ubuntu下建立C/C++编程环境(C和..

评论

帐  号: 密码: (新用户注册)
验 证 码:
表  情:
内  容: