diff --git a/README.md b/README.md
index 440fff2..04d37c6 100644
--- a/README.md
+++ b/README.md
@@ -185,4 +185,4 @@ TODO
 
 ## :bookmark_tabs: 后 记
 
-[资料分享与工具推荐](https://github.com/heibaiying/BigData-Notes/blob/master/notes/资料分享与工具推荐.md)
+[资料分享与开发工具推荐](https://github.com/heibaiying/BigData-Notes/blob/master/notes/资料分享与工具推荐.md)
\ No newline at end of file
diff --git a/code/spark/spark-core/file/emp.json b/code/spark/spark-core/file/emp.json
deleted file mode 100644
index 03af1f5..0000000
--- a/code/spark/spark-core/file/emp.json
+++ /dev/null
@@ -1,14 +0,0 @@
-{"EMPNO": 7369,"ENAME": "SMITH","JOB": "CLERK","MGR": 7902,"HIREDATE": "1980-12-17 00:00:00","SAL": 800.00,"COMM": null,"DEPTNO": 20}
-{"EMPNO": 7499,"ENAME": "ALLEN","JOB": "SALESMAN","MGR": 7698,"HIREDATE": "1981-02-20 00:00:00","SAL": 1600.00,"COMM": 300.00,"DEPTNO": 30}
-{"EMPNO": 7521,"ENAME": "WARD","JOB": "SALESMAN","MGR": 7698,"HIREDATE": "1981-02-22 00:00:00","SAL": 1250.00,"COMM": 500.00,"DEPTNO": 30}
-{"EMPNO": 7566,"ENAME": "JONES","JOB": "MANAGER","MGR": 7839,"HIREDATE": "1981-04-02 00:00:00","SAL": 2975.00,"COMM": null,"DEPTNO": 20}
-{"EMPNO": 7654,"ENAME": "MARTIN","JOB": "SALESMAN","MGR": 7698,"HIREDATE": "1981-09-28 00:00:00","SAL": 1250.00,"COMM": 1400.00,"DEPTNO": 30}
-{"EMPNO": 7698,"ENAME": "BLAKE","JOB": "MANAGER","MGR": 7839,"HIREDATE": "1981-05-01 00:00:00","SAL": 2850.00,"COMM": null,"DEPTNO": 30}
-{"EMPNO": 7782,"ENAME": "CLARK","JOB": "MANAGER","MGR": 7839,"HIREDATE": "1981-06-09 00:00:00","SAL": 2450.00,"COMM": null,"DEPTNO": 10}
-{"EMPNO": 7788,"ENAME": "SCOTT","JOB": "ANALYST","MGR": 7566,"HIREDATE": "1987-04-19 00:00:00","SAL": 1500.00,"COMM": null,"DEPTNO": 20}
-{"EMPNO": 7839,"ENAME": "KING","JOB": "PRESIDENT","MGR": null,"HIREDATE": "1981-11-17 00:00:00","SAL": 5000.00,"COMM": null,"DEPTNO": 10}
-{"EMPNO": 7844,"ENAME": "TURNER","JOB": "SALESMAN","MGR": 7698,"HIREDATE": "1981-09-08 00:00:00","SAL": 1500.00,"COMM": 0.00,"DEPTNO": 30}
-{"EMPNO": 7876,"ENAME": "ADAMS","JOB": "CLERK","MGR": 7788,"HIREDATE": "1987-05-23 00:00:00","SAL": 1100.00,"COMM": null,"DEPTNO": 20}
-{"EMPNO": 7900,"ENAME": "JAMES","JOB": "CLERK","MGR": 7698,"HIREDATE": "1981-12-03 00:00:00","SAL": 950.00,"COMM": null,"DEPTNO": 30}
-{"EMPNO": 7902,"ENAME": "FORD","JOB": "ANALYST","MGR": 7566,"HIREDATE": "1981-12-03 00:00:00","SAL": 3000.00,"COMM": null,"DEPTNO": 20}
-{"EMPNO": 7934,"ENAME": "MILLER","JOB": "CLERK","MGR": 7782,"HIREDATE": "1982-01-23 00:00:00","SAL": 1300.00,"COMM": null,"DEPTNO": 10}
\ No newline at end of file
diff --git a/code/spark/spark-core/pom.xml b/code/spark/spark-core/pom.xml
deleted file mode 100644
index e8128f4..0000000
--- a/code/spark/spark-core/pom.xml
+++ /dev/null
@@ -1,59 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-    <modelVersion>4.0.0</modelVersion>
-
-    <groupId>com.heibaiying</groupId>
-    <artifactId>spark-core</artifactId>
-    <version>1.0</version>
-
-    <properties>
-        <scala.version>2.12</scala.version>
-        <spark.version>2.4.0</spark.version>
-    </properties>
-
-    <build>
-        <plugins>
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-compiler-plugin</artifactId>
-                <configuration>
-                    <source>8</source>
-                    <target>8</target>
-                </configuration>
-            </plugin>
-        </plugins>
-    </build>
-
-
-    <dependencies>
-
-        <dependency>
-            <groupId>org.apache.spark</groupId>
-            <artifactId>spark-core_${scala.version}</artifactId>
-            <version>${spark.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.spark</groupId>
-            <artifactId>spark-sql_${scala.version}</artifactId>
-            <version>${spark.version}</version>
-        </dependency>
-
-
-        <dependency>
-            <groupId>junit</groupId>
-            <artifactId>junit</artifactId>
-            <version>4.12</version>
-        </dependency>
-
-        <dependency>
-            <groupId>com.thoughtworks.paranamer</groupId>
-            <artifactId>paranamer</artifactId>
-            <version>2.8</version>
-        </dependency>
-
-    </dependencies>
-
-</project>
\ No newline at end of file
diff --git a/code/spark/spark-core/src/main/java/rdd/java/TransformationTest.java b/code/spark/spark-core/src/main/java/rdd/java/TransformationTest.java
deleted file mode 100644
index 19a21bf..0000000
--- a/code/spark/spark-core/src/main/java/rdd/java/TransformationTest.java
+++ /dev/null
@@ -1,44 +0,0 @@
-package rdd.java;
-
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.util.Arrays;
-import java.util.List;
-
-
-public class TransformationTest {
-
-
-    private static JavaSparkContext sc = null;
-
-
-    @Before
-    public void prepare() {
-        SparkConf conf = new SparkConf().setAppName("TransformationTest").setMaster("local[2]");
-        sc = new JavaSparkContext(conf);
-    }
-
-    @Test
-    public void map() {
-        List<Integer> list = Arrays.asList(3, 6, 9, 10, 12, 21);
-        /*
-         * Do not use a method reference such as System.out::println here, otherwise the following exception is thrown:
-         * org.apache.spark.SparkException: Task not serializable
-         * Caused by: java.io.NotSerializableException: java.io.PrintStream
-         * When operators such as map and foreach reference a member method or field of a class,
-         * every member of that class must be serializable; if any member is not, the exception above is thrown.
-         */
-        sc.parallelize(list).map(x -> x * 10).foreach(x -> System.out.println(x));
-    }
-
-
-    @After
-    public void destroy() {
-        sc.close();
-    }
-
-}
diff --git a/code/spark/spark-core/src/main/java/rdd/scala/SparkSqlApp.scala b/code/spark/spark-core/src/main/java/rdd/scala/SparkSqlApp.scala
deleted file mode 100644
index 5bdfa93..0000000
--- a/code/spark/spark-core/src/main/java/rdd/scala/SparkSqlApp.scala
+++ /dev/null
@@ -1,27 +0,0 @@
-package rdd.scala
-
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.functions._
-
-
-object SparkSqlApp {
-
-  // test entry point
-  def main(args: Array[String]): Unit = {
-
-    val spark = SparkSession.builder().appName("aggregations").master("local[2]").getOrCreate()
-
-    val df = spark.read.json("/usr/file/json/emp.json")
-
-    import spark.implicits._
-
-    df.select($"ename").limit(5).show()
-    df.sort("sal").limit(3).show()
-
-    df.orderBy(desc("sal")).limit(3).show()
-
-    df.select("deptno").distinct().show()
-
-    df.orderBy(desc("deptno"), asc("sal")).show(2)
-  }
-}
diff --git a/code/spark/spark-core/src/main/java/rdd/scala/TransformationTest.scala b/code/spark/spark-core/src/main/java/rdd/scala/TransformationTest.scala
deleted file mode 100644
index 1b3f03e..0000000
--- a/code/spark/spark-core/src/main/java/rdd/scala/TransformationTest.scala
+++ /dev/null
@@ -1,201 +0,0 @@
-package rdd.scala
-
-import org.apache.spark.{SparkConf, SparkContext}
-import org.junit.{After, Test}
-
-import scala.collection.mutable.ListBuffer
-
-class TransformationTest {
-
-  val conf: SparkConf = new SparkConf().setAppName("TransformationTest").setMaster("local[2]")
-  val sc = new SparkContext(conf)
-
-
-  @Test
-  def map(): Unit = {
-    val list = List(1, 2, 3)
-    sc.parallelize(list).map(_ * 10).foreach(println)
-  }
-
-
-  @Test
-  def filter(): Unit = {
-    val list = List(3, 6, 9, 10, 12, 21)
-    sc.parallelize(list).filter(_ >= 10).foreach(println)
-  }
-
-
-  @Test
-  def flatMap(): Unit = {
-    val list = List(List(1, 2), List(3), List(), List(4, 5))
-    sc.parallelize(list).flatMap(_.toList).map(_ * 10).foreach(println)
-
-    val lines = List("spark flume spark",
-      "hadoop flume hive")
-    sc.parallelize(lines).flatMap(line => line.split(" ")).
-      map(word => (word, 1)).reduceByKey(_ + _).foreach(println)
-
-  }
-
-
-  @Test
-  def mapPartitions(): Unit = {
-    val list = List(1, 2, 3, 4, 5, 6)
-    sc.parallelize(list, 3).mapPartitions(iterator => {
-      val buffer = new ListBuffer[Int]
-      while (iterator.hasNext) {
-        buffer.append(iterator.next() * 100)
-      }
-      buffer.toIterator
-    }).foreach(println)
-  }
-
-
-  @Test
-  def mapPartitionsWithIndex(): Unit = {
-    val list = List(1, 2, 3, 4, 5, 6)
-    sc.parallelize(list, 3).mapPartitionsWithIndex((index, iterator) => {
-      val buffer = new ListBuffer[String]
-      while (iterator.hasNext) {
-        buffer.append(index + "分区:" + iterator.next() * 100)
-      }
-      buffer.toIterator
-    }).foreach(println)
-  }
-
-
-  @Test
-  def sample(): Unit = {
-    val list = List(1, 2, 3, 4, 5, 6)
-    sc.parallelize(list).sample(withReplacement = false, fraction = 0.5).foreach(println)
-  }
-
-
-  @Test
-  def union(): Unit = {
-    val list1 = List(1, 2, 3)
-    val list2 = List(4, 5, 6)
-    sc.parallelize(list1).union(sc.parallelize(list2)).foreach(println)
-  }
-
-
-  @Test
-  def intersection(): Unit = {
-    val list1 = List(1, 2, 3, 4, 5)
-    val list2 = List(4, 5, 6)
-    sc.parallelize(list1).intersection(sc.parallelize(list2)).foreach(println)
-  }
-
-  @Test
-  def distinct(): Unit = {
-    val list = List(1, 2, 2, 4, 4)
-    sc.parallelize(list).distinct().foreach(println)
-  }
-
-
-  @Test
-  def groupByKey(): Unit = {
-    val list = List(("hadoop", 2), ("spark", 3), ("spark", 5), ("storm", 6), ("hadoop", 2))
-    sc.parallelize(list).groupByKey().map(x => (x._1, x._2.toList)).foreach(println)
-  }
-
-
-  @Test
-  def reduceByKey(): Unit = {
-    val list = List(("hadoop", 2), ("spark", 3), ("spark", 5), ("storm", 6), ("hadoop", 2))
-    sc.parallelize(list).reduceByKey(_ + _).foreach(println)
-  }
-
-  @Test
-  def aggregateByKey(): Unit = {
-    val list = List(("hadoop", 3), ("hadoop", 2), ("spark", 4), ("spark", 3), ("storm", 6), ("storm", 8))
-    sc.parallelize(list, numSlices = 6).aggregateByKey(zeroValue = 0, numPartitions = 5)(
-      seqOp = math.max(_, _),
-      combOp = _ + _
-    ).getNumPartitions
-  }
-
-
-  @Test
-  def sortBy(): Unit = {
-    val list01 = List((100, "hadoop"), (90, "spark"), (120, "storm"))
-    sc.parallelize(list01).sortByKey(ascending = false).foreach(println)
-
-    val list02 = List(("hadoop", 100), ("spark", 90), ("storm", 120))
-    sc.parallelize(list02).sortBy(x => x._2, ascending = false).foreach(println)
-  }
-
-
-  @Test
-  def join(): Unit = {
-    val list01 = List((1, "student01"), (2, "student02"), (3, "student03"))
-    val list02 = List((1, "teacher01"), (2, "teacher02"), (3, "teacher03"))
-    sc.parallelize(list01).join(sc.parallelize(list02)).foreach(println)
-  }
-
-
-  @Test
-  def cogroup(): Unit = {
-    val list01 = List((1, "a"), (1, "a"), (2, "b"), (3, "e"))
-    val list02 = List((1, "A"), (2, "B"), (3, "E"))
-    val list03 = List((1, "[ab]"), (2, "[bB]"), (3, "eE"), (3, "eE"))
-    sc.parallelize(list01).cogroup(sc.parallelize(list02), sc.parallelize(list03)).foreach(println)
-  }
-
-
-  @Test
-  def cartesian(): Unit = {
-    val list1 = List("A", "B", "C")
-    val list2 = List(1, 2, 3)
-    sc.parallelize(list1).cartesian(sc.parallelize(list2)).foreach(println)
-  }
-
-
-  @Test
-  def reduce(): Unit = {
-    val list = List(1, 2, 3, 4, 5)
-    sc.parallelize(list).reduce((x, y) => x + y)
-    sc.parallelize(list).reduce(_ + _)
-  }
-
-  // extend Ordering[T] to implement a custom comparator
-  class CustomOrdering extends Ordering[(Int, String)] {
-    override def compare(x: (Int, String), y: (Int, String)): Int
-    = if (x._2.length > y._2.length) 1 else -1
-  }
-
-  @Test
-  def takeOrdered(): Unit = {
-    val list = List((1, "hadoop"), (1, "storm"), (1, "azkaban"), (1, "hive"))
"hadoop"), (1, "storm"), (1, "azkaban"), (1, "hive")) - // 定义隐式默认值 - implicit val implicitOrdering = new CustomOrdering - sc.parallelize(list).takeOrdered(5) - } - - - @Test - def countByKey(): Unit = { - val list = List(("hadoop", 10), ("hadoop", 10), ("storm", 3), ("storm", 3), ("azkaban", 1)) - sc.parallelize(list).countByKey() - } - - @Test - def saveAsTextFile(): Unit = { - val list = List(("hadoop", 10), ("hadoop", 10), ("storm", 3), ("storm", 3), ("azkaban", 1)) - sc.parallelize(list).saveAsTextFile("/usr/file/temp") - } - - @Test - def saveAsSequenceFile(): Unit = { - val list = List(("hadoop", 10), ("hadoop", 10), ("storm", 3), ("storm", 3), ("azkaban", 1)) - sc.parallelize(list).saveAsSequenceFile("/usr/file/sequence") - } - - - @After - def destroy(): Unit = { - sc.stop() - } - - -} \ No newline at end of file diff --git a/code/spark/spark-core/src/main/java/rdd/scala/WordCount.scala b/code/spark/spark-core/src/main/java/rdd/scala/WordCount.scala deleted file mode 100644 index a09daf6..0000000 --- a/code/spark/spark-core/src/main/java/rdd/scala/WordCount.scala +++ /dev/null @@ -1,14 +0,0 @@ -package rdd.scala - -import org.apache.spark.{SparkConf, SparkContext} - - -object WordCount extends App { - - val conf = new SparkConf().setAppName("sparkBase").setMaster("local[2]") - val sc = new SparkContext(conf) - val rdd = sc.textFile("input/wc.txt").flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _) - rdd.foreach(println) - rdd.saveAsTextFile("output/") - -} \ No newline at end of file diff --git a/resources/Reformatting.java b/resources/Reformatting.java deleted file mode 100644 index 9a70635..0000000 --- a/resources/Reformatting.java +++ /dev/null @@ -1,167 +0,0 @@ -import javafx.util.Pair; - -import java.io.File; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import static java.util.regex.Pattern.*; - -/** - * @author : heibaiying - * @description : 生成导航和图片格式转换 - */ -public class Reformatting { - - /** - * GITHUB 用户名 - **/ - private static final String GITHUB_USERNAME = "heibaiying"; - /** - * 项目地址 - **/ - private static final String PROJECT_NAME = "BigData-Notes"; - - public static void main(String[] args) { - - if (args.length < 1) { - System.out.println("请输入文件路径"); - return; - } - - String dir = "D:\\BigData-Notes\\notes\\Hbase协处理器.md"; - - String preUrl = "https://github.com/" + GITHUB_USERNAME + "/" + PROJECT_NAME + "/blob/master/pictures/"; - String regex = "(!\\[(\\S*)]\\(\\S:\\\\" + PROJECT_NAME + "\\\\pictures\\\\(\\S*)\\)[^(
)]*?)"; - - List filesList = getAllFile(dir, new ArrayList<>()); - for (String filePath : filesList) { - // 获取文件内容 - String content = getContent(filePath); - // 修改图片 - String newContent = changeImageUrl(content, preUrl, regex); - // 获取全部标题 - List> allTitle = getAllTitle(newContent); - // 生成导航 - String nav = genNav(allTitle); - // 写出并覆盖原文件 - write(filePath, newContent, nav); - } - System.out.println("格式转换成功!"); - } - - - private static String changeImageUrl(String content, String preUrl, String oldImageUrlRegex) { - - //github 支持的居中方式
-        return content.replaceAll(oldImageUrlRegex,
-                String.format("<div align=\"center\"> <img src=\"%s$3\"/> </div>", preUrl));
", preUrl)); - - } - - private static List getAllFile(String dir, List filesList) { - File file = new File(dir); - //如果是文件 则不遍历 - if (file.isFile() && file.getName().endsWith(".md")) { - filesList.add(file.getAbsolutePath()); - } - //如果是文件夹 则遍历下面的所有文件 - File[] files = file.listFiles(); - if (files != null) { - for (File f : files) { - if (f.isDirectory() && !f.getName().startsWith(".")) { - getAllFile(f.getAbsolutePath(), filesList); - } else if (f.getName().endsWith(".md")) { - filesList.add(f.getAbsolutePath()); - } - } - } - return filesList; - } - - - private static void write(String filePath, String content, String nav) { - try { - String newContent = ""; - if (content.contains("")) { - // 如果原来有目录则替换 - newContent = content.replaceAll("(?m)()", nav); - } else { - StringBuilder stringBuilder = new StringBuilder(content); - // 如果原来没有目录,则title和正文一个标题间写入 - int index = content.indexOf("## "); - stringBuilder.insert(index - 1, nav); - newContent = stringBuilder.toString(); - } - // 写出覆盖文件 - FileWriter fileWriter = new FileWriter(new File(filePath)); - fileWriter.write(newContent); - fileWriter.flush(); - } catch (IOException e) { - e.printStackTrace(); - } - - } - - private static String genNav(List> flagAndTitles) { - StringBuilder builder = new StringBuilder(); - // 目录头 - builder.append("\n"); - return builder.toString(); - } - - private static String genBlank(int i, int scale) { - StringBuilder builder = new StringBuilder(); - for (int j = 0; j < i; j++) { - for (int k = 0; k < scale; k++) { - builder.append(" "); - } - } - return builder.toString(); - } - - private static List> getAllTitle(String content) { - List> list = new ArrayList<>(); - Pattern pattern = compile("(?m)^(#{2,10})\\s?(.*)"); - Matcher matcher = pattern.matcher(content); - while (matcher.find()) { - String group2 = matcher.group(2); - if (!group2.contains("参考资料")) { - list.add(new Pair<>(matcher.group(1), group2)); - } - } - return list; - } - - private static String getContent(String filePath) { - StringBuilder builder = new StringBuilder(); - - try { - FileReader reader = new FileReader(filePath); - char[] chars = new char[1024 * 1024]; - - int read; - while ((read = reader.read(chars)) != -1) { - builder.append(new String(chars, 0, read)); - } - } catch (IOException e) { - e.printStackTrace(); - } - return builder.toString(); - } - -}