1. Preparation
For the Kafka cluster setup, refer to "Kafka集群搭建与配置" (Kafka cluster setup and configuration).
For the Spark cluster setup, refer to "Hadoop+HBase+Spark+Hive环境搭建" (Hadoop + HBase + Spark + Hive environment setup).
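The examples in the next section read from input-streaming-topic and write to output-streaming-topic. If automatic topic creation is disabled on your brokers, the topics have to exist beforehand. Below is a minimal sketch that creates them with Kafka's AdminClient, reusing the broker addresses that appear later in this article; the CreateTopics object name and the partition/replication counts are placeholders to adjust for your cluster.

package com.whut.demo

import java.util.Properties

import org.apache.kafka.clients.admin.{AdminClient, AdminClientConfig, NewTopic}

// One-off helper: create the input and output topics used by the examples below.
object CreateTopics {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG,
      "192.168.1.41:9092,192.168.1.42:9092,192.168.1.47:9092")
    val admin = AdminClient.create(props)
    try {
      // 3 partitions, replication factor 2 -- adjust both to your cluster
      val topics = java.util.Arrays.asList(
        new NewTopic("input-streaming-topic", 3, 2.toShort),
        new NewTopic("output-streaming-topic", 3, 2.toShort)
      )
      admin.createTopics(topics).all().get()
    } finally {
      admin.close()
    }
  }
}

Run it once before starting the producer and the consumer.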
2. Writing the Code (Scala)
Add the Maven dependencies to the pom:
<properties>
<kafka.version>2.0.0</kafka.version>
<spark.version>2.3.1</spark.version>
<scala.version>2.11</scala.version>
</properties>
<dependencies>
<!--spark-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-kafka-0-10_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_${scala.version}</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- log4j -->
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<dependency>
<groupId>org.scalanlp</groupId>
<artifactId>breeze-viz_2.11</artifactId>
<version>0.13.2</version>
</dependency>
</dependencies>

The producer sends the message "yang yun ni hao sha a" once per second:
package com.whut.demo

import java.util

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}

object KafkaProducer {
  def main(args: Array[String]) {
    // Broker list and topic
    val brokers = "192.168.1.41:9092,192.168.1.42:9092,192.168.1.47:9092" // Kafka brokers
    val inputTopic = "input-streaming-topic" // topic to write to
    // Producer configuration
    val props = new util.HashMap[String, Object]()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    // Create the producer
    val producer = new KafkaProducer[String, String](props)
    // Send one message per second
    while (true) {
      val key = null
      val value = "yang yun ni hao sha a"
      val message = new ProducerRecord[String, String](inputTopic, key, value)
      producer.send(message)
      println(message)
      Thread.sleep(1000)
    }
  }
}

Spark Streaming acts as the consumer, reporting every 2 seconds how often each word occurred within the last 10 seconds:
package com.whut.demo

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object SparkConsumer {
  def main(args: Array[String]) {
    /**
     * Spark master
     * local mode:   "local[*]"
     * cluster mode: "spark://192.168.1.32:7077"
     */
    val master = "local[*]"
    /**
     * Checkpoint path
     * local mode:   "checkpoint"
     * cluster mode: "hdfs://master:9000/user/checkpoint"
     */
    val checkpoint = "checkpoint"
    // Set the log level
    LogConfig.setStreamingLogLevels()
    // Batch interval in seconds
    val batchDuration = 1
    // Input topic
    val inputTopic = "input-streaming-topic"
    // Output topic (not used yet; see the write-back sketch below)
    val outputTopic = "output-streaming-topic"
    // Create the StreamingContext
    val streamingContext = new StreamingContext(
      new SparkConf().setAppName(s"${this.getClass.getSimpleName}").setMaster(master),
      Seconds(batchDuration)
    )
    streamingContext.checkpoint(checkpoint)
    // Kafka consumer configuration
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.1.41:9092,192.168.1.42:9092,192.168.1.47:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "1",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // Create the direct DStream
    val dStream = KafkaUtils.createDirectStream[String, String](
      streamingContext,
      PreferConsistent,
      Subscribe[String, String](Array(inputTopic), kafkaParams)
    )
    // Parse the received records into (key, value) pairs and count words
    val lines = dStream.map(record => (record.key, record.value))
    val words = lines.map(_._2)
    val word = words.flatMap(_.split(" "))
    val pair = word.map(x => (x, 1))
    // Window length 10 seconds, slide interval 2 seconds
    // (the inverse function "_ - _" requires checkpointing, enabled above)
    val wordCounts = pair.reduceByKeyAndWindow(_ + _, _ - _, Seconds(10), Seconds(2))
    wordCounts.print()
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}

To keep the output more concise, we also set the log level:
package com.whut.demo

import org.apache.log4j.{Level, Logger}
import org.apache.spark.internal.Logging

object LogConfig extends Logging {
  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // Log once to trigger log4j's default initialization, then raise the level
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN) // only WARN and above will be printed
    }
  }
}
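Note that outputTopic is declared in SparkConsumer but never used; the windowed counts are only printed. If you also want to publish each window's results back to Kafka, one common pattern is to add something like the following right before streamingContext.start(). This is only a sketch: it assumes the producer imports from the first listing are added to SparkConsumer, and it creates a short-lived producer per partition per batch, which is simple but not the most efficient option.

// Sketch: write every window's (word, count) pairs to the output topic.
// Requires: import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
wordCounts.foreachRDD { rdd =>
  rdd.foreachPartition { partition =>
    // The producer is created inside foreachPartition so it is instantiated
    // on the executors instead of being serialized from the driver.
    val props = new java.util.HashMap[String, Object]()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG,
      "192.168.1.41:9092,192.168.1.42:9092,192.168.1.47:9092")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    partition.foreach { case (word, count) =>
      producer.send(new ProducerRecord[String, String](outputTopic, word, count.toString))
    }
    producer.close()
  }
}

Any Kafka consumer subscribed to output-streaming-topic can then read the word counts.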
3. Submitting the Streaming Job in Cluster Mode
Local mode
Simply run both programs directly in IDEA.
Cluster mode
KafkaProducer can still be run directly in IDEA, but SparkConsumer has to be packaged into a jar and submitted with spark-submit.
In IDEA, package the program into a jar (see "IDEA 打jar,提交spark集群运行", i.e. building a jar in IDEA and submitting it to a Spark cluster).
Upload the jar to HDFS (the hdfs commands below may be useful):
hdfs dfs -ls /
hdfs dfs -rm /SparkConsumer.jar
hdfs dfs -put SparkConsumer.jar /
Submit the job with spark-submit:
spark-submit --class com.whut.demo.SparkConsumer --master spark://master:7077 hdfs://mast
Author: 杨赟快跑
Source: https://www.jianshu.com/p/f5d22edb5127