kafka+sparkstreaming+hbase
Tags: kafka, sparkstreaming, hbase

Requirement

Kafka continuously produces user operation logs of the form (userid, operation, time), while HBase stores the mapping (userid, cityid).

We need to count, for every 5-minute window, how many operations each city has had.


Approach:

1. First process the user logs from Kafka and work out, for each 5-minute window, which (userid) records fall into it (a minimal bucketing sketch follows this list).

2. Then query HBase to map each userid to its corresponding cityid.

3. At this point the data has the form (time, cityid, 1), and a single reduceByKey finishes the count.
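As a minimal illustration of the bucketing in step 1 (a hypothetical standalone helper, not part of the article's job), a timestamp can be rounded down to the start of its 5-minute window like this:

import java.text.SimpleDateFormat
import java.util.Date

object BucketSketch {
  private val fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm")

  // Round a "yyyy-MM-dd HH:mm" timestamp down to the start of its 5-minute window.
  def toBucket(time: String): String = {
    val millis = fmt.parse(time).getTime
    val bucket = millis - (millis % (5 * 60 * 1000L))
    fmt.format(new Date(bucket))
  }

  def main(args: Array[String]): Unit = {
    println(toBucket("2018-05-17 09:03")) // -> 2018-05-17 09:00
    println(toBucket("2018-05-17 09:07")) // -> 2018-05-17 09:05
  }
}

The formatData helper in the full listing below performs the same rounding, computed relative to midnight of the same day.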


How to simulate operation logs into Kafka is not covered again here;

it was described in the earlier Kafka hands-on post (a rough producer sketch follows).
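For reference, a minimal sketch of such a simulator, assuming a broker at localhost:9092, the topic city_count, a kafka-clients dependency on the classpath, and values wrapped in a Connect-style JSON envelope with a payload field (all assumptions, not the exact setup from that post):

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object OperationLogSimulator {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092") // assumed broker address
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)
    // One sample record: the payload carries "userid,operation,time", matching what json_an expects.
    val value = """{"payload":"000001,w,2018-5-17 09:00"}"""
    producer.send(new ProducerRecord[String, String]("city_count", value))
    producer.close()
  }
}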


In HBase, userid is used as the rowkey and cityid as a column (see the setup sketch below).
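A minimal sketch of preparing the tables (assuming the column family info and the qualifier city, which is what the code below reads and writes): the lookup table user_city and the result table time_city are created and a sample row is inserted.

import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

object PrepareTables {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "localhost")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    val connection = ConnectionFactory.createConnection(conf)
    val admin = connection.getAdmin

    // user_city: rowkey = userid, info:city = cityid; time_city will hold the 5-minute counts.
    for (name <- Seq("user_city", "time_city")) {
      val tableName = TableName.valueOf(name)
      if (!admin.tableExists(tableName)) {
        val desc = new HTableDescriptor(tableName)
        desc.addFamily(new HColumnDescriptor("info"))
        admin.createTable(desc)
      }
    }

    // One sample lookup row: user 000001 is in shanghai.
    val table = connection.getTable(TableName.valueOf("user_city"))
    val p = new Put(Bytes.toBytes("000001"))
    p.addColumn(Bytes.toBytes("info"), Bytes.toBytes("city"), Bytes.toBytes("shanghai"))
    table.put(p)

    table.close()
    connection.close()
  }
}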


The code

package main.scala
// Requirement:
// 1. HBase holds (userid, cityid) pairs, e.g. 000001 shanghai
// 2. Kafka keeps producing user operation logs (userid, operation, time), e.g. 000001 w 2018-5-17 09:00
// 3. Every 5 minutes, count the users appearing in the operation logs, grouped by city
// Kafka topic: city_count

import java.sql.Date
import java.text.SimpleDateFormat
import java.util.Properties

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.{Duration, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils
import com.alibaba.fastjson.JSON
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
//import kafka.producer.ProducerConfig
//import kafka.producer.Producer
//import kafka.producer.KeyedMessage
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.Bytes

object kafka_spark_hbase {
  // set up the connection to HBase
  val configuration = HBaseConfiguration.create()
  configuration.set("hbase.zookeeper.property.clientPort", "2181")
  configuration.set("hbase.zookeeper.quorum", "localhost")
  val connection = ConnectionFactory.createConnection(configuration)
  val admin = connection.getAdmin()                                // get the admin
  val table = connection.getTable(TableName.valueOf("user_city"))  // the HBase lookup table
  val table2 = connection.getTable(TableName.valueOf("time_city")) // the HBase result table

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("kafka-spark-demo")
    val scc = new org.apache.spark.streaming.StreamingContext(sparkConf, Duration(10000)) // create the Spark Streaming context
    // scc.checkpoint(".") // not needed for now
    val topics = Set("city_count") // the Kafka topic we consume
    val kafkaParam = Map(
      "metadata.broker.list" -> "localhost:9092", // Kafka broker list address
      "auto.offset.reset" -> "smallest"           // lets streaming consume the topic from the beginning
    )
    val stream: InputDStream[(String, String)] = createStream(scc, kafkaParam, topics) // create the stream: pass in the context, Kafka config, topic names
    val wordCount = stream.map(l => json_an(l._2)).map(l => (deal_user_city(l.toString), 1)).reduceByKey(_ + _) // process
    // store the processed data back into HBase
    wordCount.map { line =>
      val time = line._1.toString
      val city = line._2.toString
      val p = new Put(time.getBytes)
      // specify the column and value for the put (the old put.add method is deprecated)
      p.addColumn("info".getBytes, "city".getBytes, city.getBytes)
      // commit
      table2.put(p)
      "store success!"
    }.print()
    // wordCount.print() // print to the console to check the result
    scc.start()            // actually start the job
    scc.awaitTermination() // block and wait
  }

  /**
   * Create a stream that reads data from Kafka.
   *
   * @param scc        Spark Streaming context
   * @param kafkaParam Kafka configuration
   * @param topics     set of topics to consume
   * @return
   */
  def createStream(scc: StreamingContext, kafkaParam: Map[String, String], topics: Set[String]) = {
    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](scc, kafkaParam, topics)
  }

  // round the timestamp down to its 5-minute window
  def formatData(line: String) = {
    val date = new SimpleDateFormat("yyyy-MM-dd H:mm")
    val d = new SimpleDateFormat("yyyy-MM-dd")
    val dateFormated = date.parse(line)
    val dateFormated3 = date.parse(line.split(" ")(0) + " 0:0")
    val dateFormated2 = date.format(dateFormated)
    val dateFormated4 = date.format(dateFormated3)
    val dateFf = date.parse(dateFormated2).getTime
    val dateFf2 = date.parse(dateFormated4).getTime
    val r = dateFf - dateFf2
    val hash = r / 300000
    val final_date = new Date(hash.toInt * 300000 + dateFf2)
    date.format(final_date)
  }

  // (time, id): look up the user's city in HBase and replace the userid with the cityid
  def deal_user_city(str: String) = {
    val str_1 = str.split(',')
    val id = str_1(1).split(')')(0)
    // query HBase with a Get to find the matching row
    val g = new Get(id.getBytes)
    val result = table.get(g)
    // read the value of the corresponding column
    val value = Bytes.toString(result.getValue("info".getBytes, "city".getBytes))
    str_1(0).split('(')(1) + ' ' + value
  }

  // string handling: extract the timestamp here
  def json_an(str: String) = {
    if (str.length < 10) {
      1
    } else {
      val json = JSON.parseObject(str)
      val main_v = json.get("payload")
      val v = main_v.toString.split(",")
      if (v.length == 3) {
        (formatData(v(2)), v(0))
      } else {
        "NAN"
      }
    }
  }
}

A note on why the Kafka stream is handled as JSON:

Kafka Connect is used to feed the topic, and in that mode the content written to the topic is JSON (a sample message and how it is unwrapped are shown below).
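As a rough illustration (the exact envelope fields depend on the Connect converter settings, so the message below is only an assumption modeled on the earlier example), a value read from the topic and the unwrapping that json_an performs with fastjson look like this:

import com.alibaba.fastjson.JSON

object PayloadSketch {
  def main(args: Array[String]): Unit = {
    // A Connect-style value: the actual log line sits in the "payload" field.
    val msg = """{"schema":{"type":"string","optional":false},"payload":"000001,w,2018-5-17 09:00"}"""
    val payload = JSON.parseObject(msg).get("payload").toString
    val Array(userid, operation, time) = payload.split(",", 3)
    println(s"$userid $operation $time") // 000001 w 2018-5-17 09:00
  }
}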


pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.huangxiao</groupId>
    <artifactId>streaming</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.3.0</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-8 -->
        <!-- Kafka 0.8 direct-stream connector built against Spark 2.3.0 -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
            <version>2.3.0</version>
        </dependency>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.4.4</version>
            <exclusions>
                <exclusion>
                    <groupId>io.netty</groupId>
                    <artifactId>netty-all</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

    </dependencies>
</project>

