hadoop: implementing wordcount, a simple word extraction and counting program, in Java and Python
2019-03-05 00:38:59

java

package com.demo;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class worldcount {
    
	public static void countWords(String str){

        Map<String, Integer> map = new HashMap<String, Integer>();
        Pattern p = Pattern.compile("\\b[a-zA-Z-]+\\b"); // regex: runs of letters and hyphens
        Matcher m = p.matcher(str);                      // the string to scan

        while(m.find()){
            String mstr=m.group();
            if(map.containsKey(mstr)){ // word already seen: increment its count
                map.put(mstr, map.get(mstr)+1); 
            }else{
                map.put(mstr, 1);
            }
        }
        Set<Entry<String, Integer>> entrySet = map.entrySet();
        Iterator<Entry<String,Integer>> it=entrySet.iterator();
        while(it.hasNext()){
            Entry<String, Integer> next = it.next();
            System.out.println(next.getKey()+" count: "+next.getValue());
        }
    } 
	public static void main(String[] args) throws IOException {
		String encoding = "GBK";

		File file = new File("E://linux/README.txt");

		FileInputStream fileInputStream = new FileInputStream(file);

		InputStreamReader inputStreamReader = new InputStreamReader(fileInputStream, encoding);

		try {
			// wrap the character stream (decoded from the file's byte stream)
			// in a buffer so the file can be read line by line
			BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
			String lineTxt = null;

			while ((lineTxt = bufferedReader.readLine()) != null) {
				System.out.println(lineTxt);
				worldcount.countWords(lineTxt);
			}
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		}
		inputStreamReader.close();
	}
}
python

# -*- coding=utf-8 -*-
import codecs

def readFile(filename):  
    f = open(filename, "r", encoding="utf-8")  
    contents = f.readlines()  
    f.close()  
    return contents  

def outputFile(filename, resultDict):
    f=codecs.open(filename, "w", "utf-8")
    
    for key in resultDict.keys():
        parts=[str(key),':',str(resultDict[key])]
        line=''.join(parts)
        print(line)
        f.write(line)
        f.write('\r\n')
    
    f.close()

def wordCount(contents):
    resultDict={}
    
    for line in contents:
        sList=line.split(",")
        for sen in sList:
            words=sen.split(" ")
            for word in words:
                word=word.strip()
                if word=="":
                    continue
                
                if word not in resultDict:
                    resultDict[word]=1
                else:
                    resultDict[word]+=1
    
    print(resultDict)
    
    return resultDict

if __name__ == '__main__':
    print('start read file')
    contents=readFile("readme.txt")
    
    print('start word count')
    resultDict=wordCount(contents)
    
    print('output result')
    outputFile("woutput.txt", resultDict)
mapreduce

package hdfs;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {


  public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable>{

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> {

    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);               // jar containing the job classes
    job.setMapperClass(TokenizerMapper.class);        // Mapper class for the job
    job.setCombinerClass(IntSumReducer.class);        // Combiner class for the job
    job.setReducerClass(IntSumReducer.class);         // Reducer class for the job
    job.setOutputKeyClass(Text.class);                // key class of the job's output
    job.setOutputValueClass(IntWritable.class);       // value class of the job's output
    FileInputFormat.addInputPath(job, new Path(args[0]));    // input path
    FileOutputFormat.setOutputPath(job, new Path(args[1]));  // output path
    System.exit(job.waitForCompletion(true) ? 0 : 1); // run the job and exit with its status
  }
}
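
The same map/combine/reduce structure can also be expressed in Python. The sketch below uses the third-party mrjob package, which is not part of the original article and would have to be installed separately (for example with pip); treat it as an illustration of the idea rather than a drop-in replacement for the Java job.

# Sketch of the word-count job in Python, assuming the third-party mrjob package is installed.
from mrjob.job import MRJob

class MRWordCount(MRJob):

    def mapper(self, _, line):
        # emit (word, 1) for every whitespace-separated token, like TokenizerMapper above
        for word in line.split():
            yield word, 1

    def combiner(self, word, counts):
        # local pre-aggregation, like the Combiner above
        yield word, sum(counts)

    def reducer(self, word, counts):
        # sum the partial counts, like IntSumReducer above
        yield word, sum(counts)

if __name__ == '__main__':
    MRWordCount.run()

Locally this can be run as "python mr_wordcount.py readme.txt" (the script name here is made up); mrjob also provides a Hadoop runner (-r hadoop) for submitting against a cluster, assuming the Hadoop environment is configured.
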


