diff --git a/code/Storm/storm-hdfs-integration/pom.xml b/code/Storm/storm-hdfs-integration/pom.xml
index f65e03c..643b3de 100644
--- a/code/Storm/storm-hdfs-integration/pom.xml
+++ b/code/Storm/storm-hdfs-integration/pom.xml
@@ -9,7 +9,6 @@
    <version>1.0</version>
-        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <storm.version>1.2.2</storm.version>
@@ -20,6 +19,53 @@
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.storm</groupId>
+            <artifactId>storm-core</artifactId>
+            <version>${storm.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.storm</groupId>
+            <artifactId>storm-hdfs</artifactId>
+            <version>${storm.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-common</artifactId>
+            <version>2.6.0-cdh5.15.2</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-log4j12</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-client</artifactId>
+            <version>2.6.0-cdh5.15.2</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-log4j12</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-hdfs</artifactId>
+            <version>2.6.0-cdh5.15.2</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-log4j12</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+    </dependencies>
+
@@ -83,53 +129,4 @@
-
-
-    <dependencies>
-        <dependency>
-            <groupId>org.apache.storm</groupId>
-            <artifactId>storm-core</artifactId>
-            <version>${storm.version}</version>
-        </dependency>
-
-        <dependency>
-            <groupId>org.apache.storm</groupId>
-            <artifactId>storm-hdfs</artifactId>
-            <version>${storm.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.hadoop</groupId>
-            <artifactId>hadoop-common</artifactId>
-            <version>2.6.0-cdh5.15.2</version>
-            <exclusions>
-                <exclusion>
-                    <groupId>org.slf4j</groupId>
-                    <artifactId>slf4j-log4j12</artifactId>
-                </exclusion>
-            </exclusions>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.hadoop</groupId>
-            <artifactId>hadoop-client</artifactId>
-            <version>2.6.0-cdh5.15.2</version>
-            <exclusions>
-                <exclusion>
-                    <groupId>org.slf4j</groupId>
-                    <artifactId>slf4j-log4j12</artifactId>
-                </exclusion>
-            </exclusions>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.hadoop</groupId>
-            <artifactId>hadoop-hdfs</artifactId>
-            <version>2.6.0-cdh5.15.2</version>
-            <exclusions>
-                <exclusion>
-                    <groupId>org.slf4j</groupId>
-                    <artifactId>slf4j-log4j12</artifactId>
-                </exclusion>
-            </exclusions>
-        </dependency>
-    </dependencies>
-
\ No newline at end of file
diff --git a/code/Storm/storm-kafka-integration/pom.xml b/code/Storm/storm-kafka-integration/pom.xml
index f1ffe16..48afea3 100644
--- a/code/Storm/storm-kafka-integration/pom.xml
+++ b/code/Storm/storm-kafka-integration/pom.xml
@@ -8,6 +8,29 @@
    <artifactId>storm-kafka-integration</artifactId>
    <version>1.0</version>
+
+    <properties>
+        <storm.version>1.2.2</storm.version>
+        <kafka.version>2.2.0</kafka.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.storm</groupId>
+            <artifactId>storm-core</artifactId>
+            <version>${storm.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.storm</groupId>
+            <artifactId>storm-kafka-client</artifactId>
+            <version>${storm.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.kafka</groupId>
+            <artifactId>kafka-clients</artifactId>
+            <version>${kafka.version}</version>
+        </dependency>
+    </dependencies>
@@ -68,27 +91,4 @@
-
-    <properties>
-        <storm.version>1.2.2</storm.version>
-        <kafka.version>2.2.0</kafka.version>
-    </properties>
-
-    <dependencies>
-        <dependency>
-            <groupId>org.apache.storm</groupId>
-            <artifactId>storm-core</artifactId>
-            <version>${storm.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.storm</groupId>
-            <artifactId>storm-kafka-client</artifactId>
-            <version>${storm.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.kafka</groupId>
-            <artifactId>kafka-clients</artifactId>
-            <version>${kafka.version}</version>
-        </dependency>
-    </dependencies>
\ No newline at end of file
diff --git a/code/Storm/storm-kafka-integration/src/main/java/com/heibaiying/kafka/write/WritingToKafkaApp.java b/code/Storm/storm-kafka-integration/src/main/java/com/heibaiying/kafka/write/WritingToKafkaApp.java
index 281824b..56814e3 100644
--- a/code/Storm/storm-kafka-integration/src/main/java/com/heibaiying/kafka/write/WritingToKafkaApp.java
+++ b/code/Storm/storm-kafka-integration/src/main/java/com/heibaiying/kafka/write/WritingToKafkaApp.java
@@ -14,7 +14,7 @@ import org.apache.storm.topology.TopologyBuilder;
import java.util.Properties;
/**
- * Writes data to a specific Kafka topic
+ * Writes data to Kafka
*/
public class WritingToKafkaApp {
diff --git a/code/Storm/storm-redis-integration/pom.xml b/code/Storm/storm-redis-integration/pom.xml
index 52ee5b8..c3c91dd 100644
--- a/code/Storm/storm-redis-integration/pom.xml
+++ b/code/Storm/storm-redis-integration/pom.xml
@@ -9,10 +9,23 @@
    <version>1.0</version>
-        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <storm.version>1.2.2</storm.version>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.storm</groupId>
+            <artifactId>storm-core</artifactId>
+            <version>${storm.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.storm</groupId>
+            <artifactId>storm-redis</artifactId>
+            <version>${storm.version}</version>
+        </dependency>
+    </dependencies>
+
@@ -73,19 +86,4 @@
-
-    <dependencies>
-        <dependency>
-            <groupId>org.apache.storm</groupId>
-            <artifactId>storm-core</artifactId>
-            <version>${storm.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.storm</groupId>
-            <artifactId>storm-redis</artifactId>
-            <version>${storm.version}</version>
-        </dependency>
-    </dependencies>
-
-
\ No newline at end of file
diff --git a/code/Storm/storm-redis-integration/src/main/java/com/heibaiying/component/WordCountStoreMapper.java b/code/Storm/storm-redis-integration/src/main/java/com/heibaiying/component/WordCountStoreMapper.java
index 6343e9f..d75e568 100644
--- a/code/Storm/storm-redis-integration/src/main/java/com/heibaiying/component/WordCountStoreMapper.java
+++ b/code/Storm/storm-redis-integration/src/main/java/com/heibaiying/component/WordCountStoreMapper.java
@@ -5,7 +5,7 @@ import org.apache.storm.redis.common.mapper.RedisStoreMapper;
import org.apache.storm.tuple.ITuple;
/**
- * Defines the mapping between the stream data and the data stored in Redis
+ * Defines the mapping between a tuple and the data stored in Redis
*/
public class WordCountStoreMapper implements RedisStoreMapper {
private RedisDataTypeDescription description;
diff --git a/notes/Storm多种打包方式对比分析.md b/notes/Storm多种打包方式对比分析.md
new file mode 100644
index 0000000..c74e8ed
--- /dev/null
+++ b/notes/Storm多种打包方式对比分析.md
@@ -0,0 +1,309 @@
+# Comparison of Storm Packaging Methods
+
+## 1. Introduction
+
+Before a Storm topology can be submitted to a server cluster for execution, the project has to be packaged. This article compares the available packaging approaches and points out the issues to watch for during packaging. There are three main approaches:
+
++ Approach 1: package directly with `mvn package`, without any extra plugin;
++ Approach 2: package with the maven-assembly-plugin;
++ Approach 3: package with the maven-shade-plugin.
+
+Each of them is described in detail below.
+
+
+
+## 2. mvn package
+
+### 2.1 Limitations of mvn package
+
+Packaging the project with a plain `mvn package`, without configuring any plugin in the POM, works fine for projects that use no external dependencies.
+
+If the project uses third-party JARs, however, this breaks down: the JAR produced by `mvn package` does not contain the dependencies, so submitting it to the server results in exceptions about missing third-party classes.
+
+Is there still a way to package like this while using third-party JARs? Yes, and it is covered in the official documentation under [Command Line Client](http://storm.apache.org/releases/2.0.0-SNAPSHOT/Command-line-client.html). The main options are as follows.
+
+### 2.2 Workaround
+
+When submitting the topology with `storm jar topology-jar-path class ...`, third-party dependencies can be supplied as follows:
+
++ if the third-party JAR is available locally, specify it with `--jars`;
++ if the third-party JAR lives in the remote central repository, specify it with `--artifacts`; dependencies can be excluded with the `^` symbol;
++ if the third-party JAR lives in some other repository, additionally point to that repository with `--artifactRepositories`; the repository name and address are separated by the `^` symbol.
+
+The following example covers all three cases:
+
+```shell
+./bin/storm jar example/storm-starter/storm-starter-topologies-*.jar org.apache.storm.starter.RollingTopWords blobstore-remote2 remote --jars "./external/storm-redis/storm-redis-1.1.0.jar,./external/storm-kafka/storm-kafka-1.1.0.jar" --artifacts "redis.clients:jedis:2.9.0,org.apache.kafka:kafka_2.10:0.8.2.2^org.slf4j:slf4j-log4j12" --artifactRepositories "jboss-repository^http://repository.jboss.com/maven2,HDPRepo^http://repo.hortonworks.com/content/groups/public/"
+```
+
+
+
+## 3. The maven-assembly-plugin
+
+### 3.1 What the official documentation says
+
+The maven-assembly-plugin is the packaging method described in the official documentation; the following excerpt comes from [Running Topologies on a Production Cluster](http://storm.apache.org/releases/2.0.0-SNAPSHOT/Running-topologies-on-a-production-cluster.html):
+
+> If you're using Maven, the [Maven Assembly Plugin](http://maven.apache.org/plugins/maven-assembly-plugin/) can do the packaging for you. Just add this to your pom.xml:
+>
+> ```xml
+>  <plugin>
+>    <artifactId>maven-assembly-plugin</artifactId>
+>    <configuration>
+>      <descriptorRefs>
+>        <descriptorRef>jar-with-dependencies</descriptorRef>
+>      </descriptorRefs>
+>      <archive>
+>        <manifest>
+>          <mainClass>com.path.to.main.Class</mainClass>
+>        </manifest>
+>      </archive>
+>    </configuration>
+>  </plugin>
+> ```
+>
+> Then run mvn assembly:assembly to get an appropriately packaged jar. Make sure you [exclude](http://maven.apache.org/plugins/maven-assembly-plugin/examples/single/including-and-excluding-artifacts.html) the Storm jars since the cluster already has Storm on the classpath.
+
+The two key points are:
+
+- package with the maven-assembly-plugin, because it bundles all dependencies into the final JAR;
+- exclude the Storm jars that the Storm cluster environment already provides.
+
+Using the maven-assembly-plugin is straightforward: declare it in the pom.xml and set the packaging format to `jar-with-dependencies` in the `<descriptorRef>` tag. That leaves one question: how do we exclude the Storm jars?
+
+### 3.2 Excluding the Storm jars
+
+A quick note: `jar-with-dependencies` is one of Maven's pre-defined assembly formats, documented in the official Maven page [Pre-defined Descriptor Files](http://maven.apache.org/plugins/maven-assembly-plugin/descriptor-refs.html).
+
+To exclude a specific dependency, `storm-core` in this example, you can modify the XML of the `jar-with-dependencies` descriptor:
+
+```xml
+<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
+    <id>jar-with-dependencies</id>
+    <formats>
+        <format>jar</format>
+    </formats>
+    <includeBaseDirectory>false</includeBaseDirectory>
+    <dependencySets>
+        <dependencySet>
+            <outputDirectory>/</outputDirectory>
+            <useProjectArtifact>true</useProjectArtifact>
+            <unpack>true</unpack>
+            <scope>runtime</scope>
+            <excludes>
+                <exclude>org.apache.storm:storm-core</exclude>
+            </excludes>
+        </dependencySet>
+    </dependencySets>
+</assembly>
+```
+
+### 3.3 Final configuration
+
+When packaging with the maven-assembly-plugin, the final configuration looks like this:
+
+#### 1. Declare the plugin
+
+Declare the plugin in the pom.xml and point it at the assembly descriptor file `assembly.xml` (the name can be customized):
+
+```xml
+<build>
+    <plugins>
+        <plugin>
+            <artifactId>maven-assembly-plugin</artifactId>
+            <configuration>
+                <descriptors>
+                    <descriptor>src/main/resources/assembly.xml</descriptor>
+                </descriptors>
+                <archive>
+                    <manifest>
+                        <mainClass>com.heibaiying.wordcount.ClusterWordCountApp</mainClass>
+                    </manifest>
+                </archive>
+            </configuration>
+        </plugin>
+    </plugins>
+</build>
+```
+
+The content of the assembly.xml file is as follows:
+
+```xml
+<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
+    <id>jar-with-dependencies</id>
+    <formats>
+        <format>jar</format>
+    </formats>
+    <includeBaseDirectory>false</includeBaseDirectory>
+    <dependencySets>
+        <dependencySet>
+            <outputDirectory>/</outputDirectory>
+            <useProjectArtifact>true</useProjectArtifact>
+            <unpack>true</unpack>
+            <scope>runtime</scope>
+            <excludes>
+                <exclude>org.apache.storm:storm-core</exclude>
+            </excludes>
+        </dependencySet>
+    </dependencySets>
+</assembly>
+```
+
+> The descriptor can exclude not only dependencies but also specific files; for more configuration rules see the official documentation: [Descriptor Format](http://maven.apache.org/plugins/maven-assembly-plugin/assembly.html#)
+
+#### 2. Packaging command
+
+When packaging with the maven-assembly-plugin, the command is:
+
+```shell
+# mvn assembly:assembly
+```
+
+Packaging produces two JARs. The one whose name ends with `jar-with-dependencies` contains the third-party dependencies; that suffix comes from the `<id>` tag in `assembly.xml` and can be customized. Submit this JAR to the cluster environment and it can be used directly.
+
+
+
+
+
+## 4. The maven-shade-plugin
+
+### 4.1 What the official documentation says
+
+The third approach is the maven-shade-plugin. Why is it needed when the maven-assembly-plugin already exists? The official documentation explains this in the chapter on HDFS integration, [Storm HDFS Integration](http://storm.apache.org/releases/2.0.0-SNAPSHOT/storm-hdfs.html). The original text reads:
+
+>When packaging your topology, it's important that you use the [maven-shade-plugin](http://storm.apache.org/releases/2.0.0-SNAPSHOT/storm-hdfs.html) as opposed to the [maven-assembly-plugin](http://storm.apache.org/releases/2.0.0-SNAPSHOT/storm-hdfs.html).
+>
+>The shade plugin provides facilities for merging JAR manifest entries, which the hadoop client leverages for URL scheme resolution.
+>
+>If you experience errors such as the following:
+>
+>```
+>java.lang.RuntimeException: Error preparing HdfsBolt: No FileSystem for scheme: hdfs
+>```
+>
+>it's an indication that your topology jar file isn't packaged properly.
+>
+>If you are using maven to create your topology jar, you should use the following `maven-shade-plugin` configuration to create your topology jar.
+
+The first sentence already makes it clear: when integrating with HDFS you must use the maven-shade-plugin instead of the maven-assembly-plugin, otherwise the RuntimeException shown above is thrown.
+
+The maven-shade-plugin has other advantages as well. For example, when your project depends on many JARs, and those JARs in turn depend on other JARs, you can end up with different versions of a JAR and with resource files that share the same name. In that situation the shade plugin tries to merge all the resource files, instead of simply overwriting them as the assembly plugin does.
+
+### 4.2 Configuration
+
+A sample configuration is shown below:
+
+```xml
+<plugin>
+    <groupId>org.apache.maven.plugins</groupId>
+    <artifactId>maven-shade-plugin</artifactId>
+    <configuration>
+        <createDependencyReducedPom>true</createDependencyReducedPom>
+        <filters>
+            <filter>
+                <artifact>*:*</artifact>
+                <excludes>
+                    <exclude>META-INF/*.SF</exclude>
+                    <exclude>META-INF/*.sf</exclude>
+                    <exclude>META-INF/*.DSA</exclude>
+                    <exclude>META-INF/*.dsa</exclude>
+                    <exclude>META-INF/*.RSA</exclude>
+                    <exclude>META-INF/*.rsa</exclude>
+                    <exclude>META-INF/*.EC</exclude>
+                    <exclude>META-INF/*.ec</exclude>
+                    <exclude>META-INF/MSFTSIG.SF</exclude>
+                    <exclude>META-INF/MSFTSIG.RSA</exclude>
+                </excludes>
+            </filter>
+        </filters>
+        <artifactSet>
+            <excludes>
+                <exclude>org.apache.storm:storm-core</exclude>
+            </excludes>
+        </artifactSet>
+    </configuration>
+    <executions>
+        <execution>
+            <phase>package</phase>
+            <goals>
+                <goal>shade</goal>
+            </goals>
+            <configuration>
+                <transformers>
+                    <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+                </transformers>
+            </configuration>
+        </execution>
+    </executions>
+</plugin>
+```
+
+Notes on the configuration:
+
+Some JARs are signed with jarsigner when they are built (for integrity verification), which produces two files under the META-INF directory:
+
++ a signature file, with a .SF extension;
++ a signature block file, with a .DSA, .RSA, or .EC extension.
+
+If such packages are referenced more than once, packaging may fail with an `Invalid signature file digest for Manifest main attributes` exception, which is why these files are excluded in the configuration above.
+
+### 4.3 Packaging command
+
+When packaging with the maven-shade-plugin, the command is the same as a normal build:
+
+```shell
+# mvn package
+```
+
+Packaging produces two JARs; when submitting to the server cluster, use the one whose name does not start with `original`.
+
+
+
+## 5. Conclusion
+
+Having looked at all three packaging approaches in detail, the final recommendation is to **use the maven-shade-plugin**: it is the most broadly applicable, the simplest to use, and it is the approach used by all of the official Storm [examples](https://github.com/apache/storm/tree/master/examples) on GitHub.
+
+
+
+## 6. Packaging caveats
+
+Whichever packaging method you choose, you must exclude the Storm jars that the cluster environment already provides. The typical case is storm-core, which is already present in the lib directory of the Storm installation.
+
+
+
+
+
+If you do not exclude storm-core, you will typically get the following exception:
+
+```properties
+Caused by: java.lang.RuntimeException: java.io.IOException: Found multiple defaults.yaml resources. You're probably bundling the Storm jars with your topology jar. [jar:file:/usr/app/apache-storm-1.2.2/lib/storm-core-1.2.2.jar!/defaults.yaml, jar:file:/usr/appjar/storm-hdfs-integration-1.0.jar!/defaults.yaml]
+ at org.apache.storm.utils.Utils.findAndReadConfigFile(Utils.java:384)
+ at org.apache.storm.utils.Utils.readDefaultConfig(Utils.java:428)
+ at org.apache.storm.utils.Utils.readStormConfig(Utils.java:464)
+ at org.apache.storm.utils.Utils.<clinit>(Utils.java:178)
+ ... 39 more
+```
+
+
\ No newline at end of file
diff --git a/notes/Storm编程模型详解.md b/notes/Storm编程模型详解.md
index 0b74a3a..3c01401 100644
--- a/notes/Storm编程模型详解.md
+++ b/notes/Storm编程模型详解.md
@@ -471,7 +471,7 @@ storm kill ClusterWordCountApp -w 3
### 1. Limitations of mvn package
-Above, we packaged the project directly with `mvn package`, which works for projects that use no external dependencies. But if the project uses third-party JARs, problems arise, because the JAR produced by `package` does not contain the dependencies; submitting it to the server results in exceptions about missing third-party classes.
+Above, we packaged the project directly with `mvn package` without configuring any plugin in the POM, which works for projects that use no external dependencies. But if the project uses third-party JARs, problems arise, because the JAR produced by `package` does not contain the dependencies; submitting it to the server results in exceptions about missing third-party classes.
At this point you might wonder: doesn't our project use the `storm-core` dependency? The reason it ran successfully above is that the Storm cluster environment provides this JAR, in the lib directory of the installation:
diff --git a/notes/Storm集成HBase和HDFS.md b/notes/Storm集成HBase和HDFS.md
new file mode 100644
index 0000000..d424740
--- /dev/null
+++ b/notes/Storm集成HBase和HDFS.md
@@ -0,0 +1,472 @@
+# Storm Integration with HDFS and HBase
+
+## 1. Storm Integration with HDFS
+
+### 1.1 Project Structure
+
+> Source code for this example: [storm-hdfs-integration](https://github.com/heibaiying/BigData-Notes/tree/master/code/Storm/storm-hdfs-integration)
+
+### 1.2 Main Project Dependencies
+
+The main dependencies of the project are listed below; two points deserve attention:
+
++ since the Hadoop installed on my server is the CDH distribution, the CDH versions of the dependencies are imported, which requires specifying the CDH repository address in the `<repositories>` tag;
++ `hadoop-common`, `hadoop-client` and `hadoop-hdfs` all need to exclude the `slf4j-log4j12` dependency, because `storm-core` already brings it in and not excluding it risks JAR conflicts.
+
+```xml
+<properties>
+    <storm.version>1.2.2</storm.version>
+</properties>
+
+<repositories>
+    <repository>
+        <id>cloudera</id>
+        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
+    </repository>
+</repositories>
+
+<dependencies>
+    <dependency>
+        <groupId>org.apache.storm</groupId>
+        <artifactId>storm-core</artifactId>
+        <version>${storm.version}</version>
+    </dependency>
+    <dependency>
+        <groupId>org.apache.storm</groupId>
+        <artifactId>storm-hdfs</artifactId>
+        <version>${storm.version}</version>
+    </dependency>
+    <dependency>
+        <groupId>org.apache.hadoop</groupId>
+        <artifactId>hadoop-common</artifactId>
+        <version>2.6.0-cdh5.15.2</version>
+        <exclusions>
+            <exclusion>
+                <groupId>org.slf4j</groupId>
+                <artifactId>slf4j-log4j12</artifactId>
+            </exclusion>
+        </exclusions>
+    </dependency>
+    <dependency>
+        <groupId>org.apache.hadoop</groupId>
+        <artifactId>hadoop-client</artifactId>
+        <version>2.6.0-cdh5.15.2</version>
+        <exclusions>
+            <exclusion>
+                <groupId>org.slf4j</groupId>
+                <artifactId>slf4j-log4j12</artifactId>
+            </exclusion>
+        </exclusions>
+    </dependency>
+    <dependency>
+        <groupId>org.apache.hadoop</groupId>
+        <artifactId>hadoop-hdfs</artifactId>
+        <version>2.6.0-cdh5.15.2</version>
+        <exclusions>
+            <exclusion>
+                <groupId>org.slf4j</groupId>
+                <artifactId>slf4j-log4j12</artifactId>
+            </exclusion>
+        </exclusions>
+    </dependency>
+</dependencies>
+```
+
+### 1.3 DataSourceSpout
+
+```java
+/**
+ * Data source that generates word-frequency samples
+ */
+public class DataSourceSpout extends BaseRichSpout {
+
+ private List list = Arrays.asList("Spark", "Hadoop", "HBase", "Storm", "Flink", "Hive");
+
+ private SpoutOutputCollector spoutOutputCollector;
+
+ @Override
+ public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector spoutOutputCollector) {
+ this.spoutOutputCollector = spoutOutputCollector;
+ }
+
+ @Override
+ public void nextTuple() {
+ // Generate simulated data
+ String lineData = productData();
+ spoutOutputCollector.emit(new Values(lineData));
+ Utils.sleep(1000);
+ }
+
+ @Override
+ public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
+ outputFieldsDeclarer.declare(new Fields("line"));
+ }
+
+
+ /**
+ * Generate a line of simulated data
+ */
+ private String productData() {
+ Collections.shuffle(list);
+ Random random = new Random();
+ int endIndex = random.nextInt(list.size()) % (list.size()) + 1;
+ return StringUtils.join(list.toArray(), "\t", 0, endIndex);
+ }
+
+}
+```
+
+The generated sample data looks like this:
+
+```properties
+Spark HBase
+Hive Flink Storm Hadoop HBase Spark
+Flink
+HBase Storm
+HBase Hadoop Hive Flink
+HBase Flink Hive Storm
+Hive Flink Hadoop
+HBase Hive
+Hadoop Spark HBase Storm
+```
+
+### 1.4 Writing the Data to HDFS
+
+The HDFS address and the storage path are hard-coded here. In real development they can be passed in as external arguments, which makes the program more flexible (see the sketch after the code below).
+
+```java
+public class DataToHdfsApp {
+
+ private static final String DATA_SOURCE_SPOUT = "dataSourceSpout";
+ private static final String HDFS_BOLT = "hdfsBolt";
+
+ public static void main(String[] args) {
+
+ // Specify the Hadoop user name; without it, creating directories on HDFS may fail with a permission exception (RemoteException: Permission denied)
+ System.setProperty("HADOOP_USER_NAME", "root");
+
+ // Delimiter between output fields
+ RecordFormat format = new DelimitedRecordFormat()
+ .withFieldDelimiter("|");
+
+ // Sync policy: flush buffered data to HDFS every 100 tuples
+ SyncPolicy syncPolicy = new CountSyncPolicy(100);
+
+ // Rotation policy: each file is capped at 1 MB; when the limit is exceeded, a new file is created and writing continues
+ FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(1.0f, Units.MB);
+
+ // Storage path
+ FileNameFormat fileNameFormat = new DefaultFileNameFormat()
+ .withPath("/storm-hdfs/");
+
+ // Define the HdfsBolt
+ HdfsBolt hdfsBolt = new HdfsBolt()
+ .withFsUrl("hdfs://hadoop001:8020")
+ .withFileNameFormat(fileNameFormat)
+ .withRecordFormat(format)
+ .withRotationPolicy(rotationPolicy)
+ .withSyncPolicy(syncPolicy);
+
+
+ // Build the topology
+ TopologyBuilder builder = new TopologyBuilder();
+ builder.setSpout(DATA_SOURCE_SPOUT, new DataSourceSpout());
+ // save to HDFS
+ builder.setBolt(HDFS_BOLT, hdfsBolt, 1).shuffleGrouping(DATA_SOURCE_SPOUT);
+
+
+ // If the argument "cluster" is passed, submit to the cluster environment; otherwise start locally
+ if (args.length > 0 && args[0].equals("cluster")) {
+ try {
+ StormSubmitter.submitTopology("ClusterDataToHdfsApp", new Config(), builder.createTopology());
+ } catch (AlreadyAliveException | InvalidTopologyException | AuthorizationException e) {
+ e.printStackTrace();
+ }
+ } else {
+ LocalCluster cluster = new LocalCluster();
+ cluster.submitTopology("LocalDataToHdfsApp",
+ new Config(), builder.createTopology());
+ }
+ }
+}
+```
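+
+As a sketch of the external-parameter approach mentioned above, the hard-coded values could be read from the command-line arguments instead. The argument order and the fallback defaults below are illustrative assumptions, not part of the original code:
+
+```java
+// Illustrative only: take the HDFS URL and output path from args,
+// falling back to the original hard-coded values.
+String fsUrl = args.length > 1 ? args[1] : "hdfs://hadoop001:8020";
+String outputPath = args.length > 2 ? args[2] : "/storm-hdfs/";
+
+HdfsBolt hdfsBolt = new HdfsBolt()
+        .withFsUrl(fsUrl)
+        .withFileNameFormat(new DefaultFileNameFormat().withPath(outputPath))
+        .withRecordFormat(format)
+        .withRotationPolicy(rotationPolicy)
+        .withSyncPolicy(syncPolicy);
+```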
+
+### 1.5 Running the Test
+
+You can run it directly in local mode, or package it and submit it to the server cluster. The source code in this repository is packaged with the `maven-shade-plugin` by default; the packaging command is:
+
+```shell
+# mvn clean package -D maven.test.skip=true
+```
+
+After it runs, the data is stored under the `/storm-hdfs` directory on HDFS. The directory can be inspected with the following commands:
+
+```shell
+# list the directory contents
+hadoop fs -ls /storm-hdfs
+# follow the content of a file
+hadoop fs -tail -f /storm-hdfs/<filename>
+```
+
+
+
+
+
+
+
+## 2. Storm Integration with HBase
+
+### 2.1 Project Structure
+
+Source code for this example: [storm-hbase-integration](https://github.com/heibaiying/BigData-Notes/tree/master/code/Storm/storm-hbase-integration)
+
+### 2.2 Main Project Dependencies
+
+```xml
+<properties>
+    <storm.version>1.2.2</storm.version>
+</properties>
+
+<dependencies>
+    <dependency>
+        <groupId>org.apache.storm</groupId>
+        <artifactId>storm-core</artifactId>
+        <version>${storm.version}</version>
+    </dependency>
+    <dependency>
+        <groupId>org.apache.storm</groupId>
+        <artifactId>storm-hbase</artifactId>
+        <version>${storm.version}</version>
+    </dependency>
+</dependencies>
+```
+
+### 2.3 DataSourceSpout
+
+```java
+/**
+ * Data source that generates word-frequency samples
+ */
+public class DataSourceSpout extends BaseRichSpout {
+
+ private List list = Arrays.asList("Spark", "Hadoop", "HBase", "Storm", "Flink", "Hive");
+
+ private SpoutOutputCollector spoutOutputCollector;
+
+ @Override
+ public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector spoutOutputCollector) {
+ this.spoutOutputCollector = spoutOutputCollector;
+ }
+
+ @Override
+ public void nextTuple() {
+ // Generate simulated data
+ String lineData = productData();
+ spoutOutputCollector.emit(new Values(lineData));
+ Utils.sleep(1000);
+ }
+
+ @Override
+ public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
+ outputFieldsDeclarer.declare(new Fields("line"));
+ }
+
+
+ /**
+ * Generate a line of simulated data
+ */
+ private String productData() {
+ Collections.shuffle(list);
+ Random random = new Random();
+ int endIndex = random.nextInt(list.size()) % (list.size()) + 1;
+ return StringUtils.join(list.toArray(), "\t", 0, endIndex);
+ }
+
+}
+```
+
+The generated sample data looks like this:
+
+```properties
+Spark HBase
+Hive Flink Storm Hadoop HBase Spark
+Flink
+HBase Storm
+HBase Hadoop Hive Flink
+HBase Flink Hive Storm
+Hive Flink Hadoop
+HBase Hive
+Hadoop Spark HBase Storm
+```
+
+
+
+### 2.4 SplitBolt
+
+```java
+/**
+ * Splits each line of data on the given delimiter
+ */
+public class SplitBolt extends BaseRichBolt {
+
+ private OutputCollector collector;
+
+ @Override
+ public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
+ this.collector = collector;
+ }
+
+ @Override
+ public void execute(Tuple input) {
+ String line = input.getStringByField("line");
+ String[] words = line.split("\t");
+ for (String word : words) {
+ collector.emit(tuple(word, 1));
+ }
+ }
+
+ @Override
+ public void declareOutputFields(OutputFieldsDeclarer declarer) {
+ declarer.declare(new Fields("word", "count"));
+ }
+}
+```
+
+### 2.5 CountBolt
+
+```java
+/**
+ * Performs the word-frequency count
+ */
+public class CountBolt extends BaseRichBolt {
+
+ private Map counts = new HashMap<>();
+
+ private OutputCollector collector;
+
+
+ @Override
+ public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
+ this.collector=collector;
+ }
+
+ @Override
+ public void execute(Tuple input) {
+ String word = input.getStringByField("word");
+ Integer count = counts.get(word);
+ if (count == null) {
+ count = 0;
+ }
+ count++;
+ counts.put(word, count);
+ // Emit the result
+ collector.emit(new Values(word, String.valueOf(count)));
+
+ }
+
+ @Override
+ public void declareOutputFields(OutputFieldsDeclarer declarer) {
+ declarer.declare(new Fields("word", "count"));
+ }
+}
+```
+
+### 2.6 WordCountToHBaseApp
+
+```java
+/**
+ * Performs the word-frequency count and stores the results in HBase
+ */
+public class WordCountToHBaseApp {
+
+ private static final String DATA_SOURCE_SPOUT = "dataSourceSpout";
+ private static final String SPLIT_BOLT = "splitBolt";
+ private static final String COUNT_BOLT = "countBolt";
+ private static final String HBASE_BOLT = "hbaseBolt";
+
+ public static void main(String[] args) {
+
+ // Storm configuration
+ Config config = new Config();
+
+ // HBase configuration
+ Map hbConf = new HashMap<>();
+ hbConf.put("hbase.rootdir", "hdfs://hadoop001:8020/hbase");
+ hbConf.put("hbase.zookeeper.quorum", "hadoop001:2181");
+
+ // Pass the HBase configuration into the Storm configuration
+ config.put("hbase.conf", hbConf);
+
+ // Define the mapping between the stream data and the data in HBase
+ SimpleHBaseMapper mapper = new SimpleHBaseMapper()
+ .withRowKeyField("word")
+ .withColumnFields(new Fields("word","count"))
+ .withColumnFamily("info");
+
+ /*
+ * Pass the table name, the data mapping and the HBase configuration key to the HBaseBolt.
+ * The table must be created beforehand: create 'WordCount','info'
+ */
+ HBaseBolt hbase = new HBaseBolt("WordCount", mapper)
+ .withConfigKey("hbase.conf");
+
+ // Build the topology
+ TopologyBuilder builder = new TopologyBuilder();
+ builder.setSpout(DATA_SOURCE_SPOUT, new DataSourceSpout(),1);
+ // split
+ builder.setBolt(SPLIT_BOLT, new SplitBolt(), 1).shuffleGrouping(DATA_SOURCE_SPOUT);
+ // count
+ builder.setBolt(COUNT_BOLT, new CountBolt(),1).shuffleGrouping(SPLIT_BOLT);
+ // save to HBase
+ builder.setBolt(HBASE_BOLT, hbase, 1).shuffleGrouping(COUNT_BOLT);
+
+
+ // If the argument "cluster" is passed, submit to the cluster environment; otherwise start locally
+ if (args.length > 0 && args[0].equals("cluster")) {
+ try {
+ StormSubmitter.submitTopology("ClusterWordCountToRedisApp", config, builder.createTopology());
+ } catch (AlreadyAliveException | InvalidTopologyException | AuthorizationException e) {
+ e.printStackTrace();
+ }
+ } else {
+ LocalCluster cluster = new LocalCluster();
+ cluster.submitTopology("LocalWordCountToRedisApp",
+ config, builder.createTopology());
+ }
+ }
+}
+```
+
+### 2.7 Running the Test
+
+You can run it directly in local mode, or package it and submit it to the server cluster. The source code in this repository is packaged with the `maven-shade-plugin` by default; the packaging command is:
+
+```shell
+# mvn clean package -D maven.test.skip=true
+```
+
+After it runs, the data is stored in the `WordCount` table in HBase. The table contents can be viewed with:
+
+```shell
+hbase > scan 'WordCount'
+```
+
+
+
+
+
+### 2.8 withCounterFields
+
+In the example above the word-frequency counting is implemented by hand and the final results are stored in HBase. Alternatively, you can specify the count field with `withCounterFields` when building the `SimpleHBaseMapper`; the specified field is then incremented automatically, which also yields a word count. Note that the field given to withCounterFields must be of type Long, not String (see the sketch after the snippet below).
+
+```java
+SimpleHBaseMapper mapper = new SimpleHBaseMapper()
+ .withRowKeyField("word")
+ .withColumnFields(new Fields("word"))
+ .withCounterFields(new Fields("count"))
+ .withColumnFamily("cf");
+```
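+
+With this approach the running count no longer has to be computed in the topology: each word occurrence can be sent to the HBaseBolt as a Long increment of 1, and HBase accumulates the counter column itself. A minimal sketch, assuming the SplitBolt is wired directly to the HBaseBolt (an adaptation for illustration, not part of the original example):
+
+```java
+// Illustrative only: emit one Long increment per word occurrence so that
+// the field configured via withCounterFields is treated as an HBase counter.
+@Override
+public void execute(Tuple input) {
+    String line = input.getStringByField("line");
+    for (String word : line.split("\t")) {
+        collector.emit(new Values(word, 1L));   // the count field must be a Long, not a String
+    }
+}
+```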
+
diff --git a/notes/Storm集成Kakfa.md b/notes/Storm集成Kakfa.md
new file mode 100644
index 0000000..20d7ed0
--- /dev/null
+++ b/notes/Storm集成Kakfa.md
@@ -0,0 +1,344 @@
+# Storm Integration with Kafka
+
+## 1. Integration Overview
+
+Storm ships two official Kafka integrations, documented separately:
+
++ [Storm Kafka Integration](http://storm.apache.org/releases/2.0.0-SNAPSHOT/storm-kafka.html): integration support mainly for Kafka 0.8.x;
++ [Storm Kafka Integration (0.10.x+)](): based on the new Kafka consumer API, providing integration support mainly for Kafka 0.10.x and later.
+
+The Kafka installed on my server is version 2.2.0 (released Mar 22, 2019), so the integration below follows the official 0.10.x+ documentation and does not apply to Kafka 0.8.x.
+
+## 2. Writing Data to Kafka
+
+### 2.1 Project Structure
+
+
+### 2.2 Main Project Dependencies
+
+```xml
+<properties>
+    <storm.version>1.2.2</storm.version>
+    <kafka.version>2.2.0</kafka.version>
+</properties>
+
+<dependencies>
+    <dependency>
+        <groupId>org.apache.storm</groupId>
+        <artifactId>storm-core</artifactId>
+        <version>${storm.version}</version>
+    </dependency>
+    <dependency>
+        <groupId>org.apache.storm</groupId>
+        <artifactId>storm-kafka-client</artifactId>
+        <version>${storm.version}</version>
+    </dependency>
+    <dependency>
+        <groupId>org.apache.kafka</groupId>
+        <artifactId>kafka-clients</artifactId>
+        <version>${kafka.version}</version>
+    </dependency>
+</dependencies>
+```
+
+### 2.3 DataSourceSpout
+
+```java
+/**
+ * Data source that generates word-frequency samples
+ */
+public class DataSourceSpout extends BaseRichSpout {
+
+ private List list = Arrays.asList("Spark", "Hadoop", "HBase", "Storm", "Flink", "Hive");
+
+ private SpoutOutputCollector spoutOutputCollector;
+
+ @Override
+ public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector spoutOutputCollector) {
+ this.spoutOutputCollector = spoutOutputCollector;
+ }
+
+ @Override
+ public void nextTuple() {
+ // Generate simulated data
+ String lineData = productData();
+ spoutOutputCollector.emit(new Values(lineData));
+ Utils.sleep(1000);
+ }
+
+ @Override
+ public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
+ outputFieldsDeclarer.declare(new Fields("line"));
+ }
+
+
+ /**
+ * Generate a line of simulated data
+ */
+ private String productData() {
+ Collections.shuffle(list);
+ Random random = new Random();
+ int endIndex = random.nextInt(list.size()) % (list.size()) + 1;
+ return StringUtils.join(list.toArray(), "\t", 0, endIndex);
+ }
+
+}
+```
+
+The generated sample data looks like this:
+
+```properties
+Spark HBase
+Hive Flink Storm Hadoop HBase Spark
+Flink
+HBase Storm
+HBase Hadoop Hive Flink
+HBase Flink Hive Storm
+Hive Flink Hadoop
+HBase Hive
+Hadoop Spark HBase Storm
+```
+
+### 2.4 WritingToKafkaApp
+
+```java
+/**
+ * Writes data to Kafka
+ */
+public class WritingToKafkaApp {
+
+ private static final String BOOTSTRAP_SERVERS = "hadoop001:9092";
+ private static final String TOPIC_NAME = "storm-topic";
+
+ public static void main(String[] args) {
+
+
+ TopologyBuilder builder = new TopologyBuilder();
+
+ // Kafka producer properties
+ Properties props = new Properties();
+ /*
+ * List of broker addresses. It does not need to contain every broker; the producer will discover the others from the ones given.
+ * Providing at least two brokers is recommended for fault tolerance.
+ */
+ props.put("bootstrap.servers", BOOTSTRAP_SERVERS);
+ /*
+ * The acks parameter controls how many partition replicas must receive the message before the producer considers the write successful.
+ * acks=0 : the producer does not wait for any response from the server before considering the write successful.
+ * acks=1 : the producer gets a success response as soon as the partition leader has received the message.
+ * acks=all : the producer gets a success response only after all replicating nodes have received the message.
+ */
+ props.put("acks", "1");
+ props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
+ props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
+
+ KafkaBolt bolt = new KafkaBolt()
+ .withProducerProperties(props)
+ .withTopicSelector(new DefaultTopicSelector(TOPIC_NAME))
+ .withTupleToKafkaMapper(new FieldNameBasedTupleToKafkaMapper<>());
+
+ builder.setSpout("sourceSpout", new DataSourceSpout(), 1);
+ builder.setBolt("kafkaBolt", bolt, 1).shuffleGrouping("sourceSpout");
+
+
+ if (args.length > 0 && args[0].equals("cluster")) {
+ try {
+ StormSubmitter.submitTopology("ClusterWritingToKafkaApp", new Config(), builder.createTopology());
+ } catch (AlreadyAliveException | InvalidTopologyException | AuthorizationException e) {
+ e.printStackTrace();
+ }
+ } else {
+ LocalCluster cluster = new LocalCluster();
+ cluster.submitTopology("LocalWritingToKafkaApp",
+ new Config(), builder.createTopology());
+ }
+ }
+}
+```
+
+### 2.5 Test Preparation
+
+Kafka needs to be started before running the test.
+
+#### 1. Start Kafka
+
+Kafka depends on ZooKeeper, which must be started first; you can start either the ZooKeeper bundled with Kafka or your own installation.
+
+```shell
+# start a standalone ZooKeeper
+bin/zkServer.sh start
+
+# start the ZooKeeper bundled with Kafka
+bin/zookeeper-server-start.sh config/zookeeper.properties
+```
+
+Start a single-node Kafka broker:
+
+```shell
+# bin/kafka-server-start.sh config/server.properties
+```
+
+#### 2. Create the topic
+
+```shell
+# create the topic used for testing
+bin/kafka-topics.sh --create --bootstrap-server hadoop001:9092 --replication-factor 1 --partitions 1 --topic storm-topic
+
+# list all topics
+bin/kafka-topics.sh --list --bootstrap-server hadoop001:9092
+```
+
+#### 3. Start a consumer to observe the writes
+
+```shell
+# bin/kafka-console-consumer.sh --bootstrap-server hadoop001:9092 --topic storm-topic --from-beginning
+```
+
+### 2.6 Test
+
+You can run it directly in local mode, or package it and submit it to the server cluster. The source code in this repository is packaged with the `maven-shade-plugin` by default; the packaging command is:
+
+```shell
+# mvn clean package -D maven.test.skip=true
+```
+
+After it starts, the consumer sees the following output:
+
+
+
+
+
+## 3. Reading Data from Kafka
+
+### 3.1 Project Structure
+
+
+
+### 3.2 ReadingFromKafkaApp
+
+```java
+/**
+ * Reads data from Kafka
+ */
+public class ReadingFromKafkaApp {
+
+ private static final String BOOTSTRAP_SERVERS = "hadoop001:9092";
+ private static final String TOPIC_NAME = "storm-topic";
+
+ public static void main(String[] args) {
+
+ final TopologyBuilder builder = new TopologyBuilder();
+ builder.setSpout("kafka_spout", new KafkaSpout<>(getKafkaSpoutConfig(BOOTSTRAP_SERVERS, TOPIC_NAME)), 1);
+ builder.setBolt("bolt", new LogConsoleBolt()).shuffleGrouping("kafka_spout");
+
+ // If the argument "cluster" is passed, submit to the cluster environment; otherwise start locally
+ if (args.length > 0 && args[0].equals("cluster")) {
+ try {
+ StormSubmitter.submitTopology("ClusterReadingFromKafkaApp", new Config(), builder.createTopology());
+ } catch (AlreadyAliveException | InvalidTopologyException | AuthorizationException e) {
+ e.printStackTrace();
+ }
+ } else {
+ LocalCluster cluster = new LocalCluster();
+ cluster.submitTopology("LocalReadingFromKafkaApp",
+ new Config(), builder.createTopology());
+ }
+ }
+
+ private static KafkaSpoutConfig getKafkaSpoutConfig(String bootstrapServers, String topic) {
+ return KafkaSpoutConfig.builder(bootstrapServers, topic)
+ // Apart from the group ID, the settings below are optional. The group ID must be set, otherwise an InvalidGroupIdException is thrown
+ .setProp(ConsumerConfig.GROUP_ID_CONFIG, "kafkaSpoutTestGroup")
+ // Retry policy
+ .setRetry(getRetryService())
+ // Interval at which offsets are committed periodically; the default is 15 s
+ .setOffsetCommitPeriodMs(10_000)
+ .build();
+ }
+
+ // Define the retry policy
+ private static KafkaSpoutRetryService getRetryService() {
+ return new KafkaSpoutRetryExponentialBackoff(TimeInterval.microSeconds(500),
+ TimeInterval.milliSeconds(2), Integer.MAX_VALUE, TimeInterval.seconds(10));
+ }
+}
+
+```
+
+### 3.3 LogConsoleBolt
+
+```java
+/**
+ * Prints the data received from Kafka
+ */
+public class LogConsoleBolt extends BaseRichBolt {
+
+
+ private OutputCollector collector;
+
+ public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
+ this.collector=collector;
+ }
+
+ public void execute(Tuple input) {
+ try {
+ String value = input.getStringByField("value");
+ System.out.println("received from kafka : "+ value);
+ // Must ack, otherwise the messages in Kafka will be consumed repeatedly
+ collector.ack(input);
+ }catch (Exception e){
+ e.printStackTrace();
+ collector.fail(input);
+ }
+
+ }
+
+ public void declareOutputFields(OutputFieldsDeclarer declarer) {
+
+ }
+}
+```
+
+Here the value written to Kafka is obtained from the `value` field.
+
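+Besides `value`, the tuples emitted by the spout's default translator also carry the record's topic, partition, offset and key (see the explanation below). A minimal sketch of reading them inside `execute()`, assuming the default translator is in use:
+
+```java
+// Illustrative only: these field names come from DefaultRecordTranslator.FIELDS.
+String topic = input.getStringByField("topic");
+int partition = input.getIntegerByField("partition");
+long offset = input.getLongByField("offset");
+String key = input.getStringByField("key");
+System.out.println("record " + topic + "-" + partition + "@" + offset + ", key = " + key);
+```
+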
+We can customize the mapping between a Kafka record and the output stream by implementing the `RecordTranslator` interface and passing the implementation in when building the `KafkaSpoutConfig`, either through the constructor or via `setRecordTranslator()`; it is then handed to the actual `KafkaSpout`. If none is specified, the built-in `DefaultRecordTranslator` is used. Its source is shown below; `FIELDS` defines all the fields available on the tuple:
+
+```java
+public class DefaultRecordTranslator implements RecordTranslator {
+ private static final long serialVersionUID = -5782462870112305750L;
+ public static final Fields FIELDS = new Fields("topic", "partition", "offset", "key", "value");
+ @Override
+ public List