modify
@@ -10,45 +10,47 @@ object SparkSqlApp {
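// Assumes the usual imports above this hunk (the calls below need them), e.g.:
// import org.apache.spark.sql.SparkSession
// import org.apache.spark.sql.functions._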
  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().appName("aggregations").master("local[2]").getOrCreate()

    // Load the employee data and register it as a temp view
    val empDF = spark.read.json("/usr/file/json/emp.json")
    empDF.createOrReplaceTempView("emp")
    empDF.show()

    // Basic aggregations: count, distinct count, approximate distinct count,
    // first/last, min/max, sum, distinct sum, and average
    empDF.select(count("ename")).show()
    empDF.select(countDistinct("deptno")).show()
    empDF.select(approx_count_distinct("ename", 0.1)).show()
    empDF.select(first("ename"), last("job")).show()
    empDF.select(min("sal"), max("sal")).show()
    empDF.select(sum("sal")).show()
    empDF.select(sumDistinct("sal")).show()
    empDF.select(avg("sal")).show()
    // Load the department data and register it as a temp view
    val deptDF = spark.read.json("/usr/file/json/dept.json")
    deptDF.createOrReplaceTempView("dept")

    deptDF.printSchema()

    // 1. Define the join expression
    val joinExpression = empDF.col("deptno") === deptDF.col("deptno")
    // 2. Inner join, in the DataFrame API and in SQL
    empDF.join(deptDF, joinExpression).select("ename", "dname").show()
    spark.sql("SELECT ename, dname FROM emp JOIN dept ON emp.deptno = dept.deptno").show()

    // Population variance, sample variance, population standard deviation, sample standard deviation
    empDF.select(var_pop("sal"), var_samp("sal"), stddev_pop("sal"), stddev_samp("sal")).show()

    // Full outer join
    empDF.join(deptDF, joinExpression, "outer").show()
    spark.sql("SELECT * FROM emp FULL OUTER JOIN dept ON emp.deptno = dept.deptno").show()

    // Left outer join
    empDF.join(deptDF, joinExpression, "left_outer").show()
    spark.sql("SELECT * FROM emp LEFT OUTER JOIN dept ON emp.deptno = dept.deptno").show()

    // Skewness and kurtosis
    empDF.select(skewness("sal"), kurtosis("sal")).show()

    // Right outer join
    empDF.join(deptDF, joinExpression, "right_outer").show()
    spark.sql("SELECT * FROM emp RIGHT OUTER JOIN dept ON emp.deptno = dept.deptno").show()

    // Pearson correlation coefficient, sample covariance, and population covariance of two columns
    empDF.select(corr("empno", "sal"), covar_samp("empno", "sal"),
      covar_pop("empno", "sal")).show()

    // Left semi join
    empDF.join(deptDF, joinExpression, "left_semi").show()
    spark.sql("SELECT * FROM emp LEFT SEMI JOIN dept ON emp.deptno = dept.deptno").show()

    // Collect distinct jobs into a set and all names into a list
    empDF.agg(collect_set("job"), collect_list("ename")).show()

    // Left anti join
    empDF.join(deptDF, joinExpression, "left_anti").show()
    spark.sql("SELECT * FROM emp LEFT ANTI JOIN dept ON emp.deptno = dept.deptno").show()

    // Group by multiple columns
    empDF.groupBy("deptno", "job").count().show()
    spark.sql("SELECT deptno, job, count(*) FROM emp GROUP BY deptno, job").show()

    // Per-department headcount and total salary, with aliases
    empDF.groupBy("deptno").agg(count("ename").alias("emp_count"), sum("sal").alias("total_sal")).show()
    spark.sql("SELECT deptno, count(ename), sum(sal) FROM emp GROUP BY deptno").show()

    // Map-style shorthand for the same aggregation
    empDF.groupBy("deptno").agg("ename" -> "count", "sal" -> "sum").show()
    /* Use cross joins only if you are absolutely, 100% sure this is the join you need.
       There is a reason you must be explicit when defining a cross join in Spark: they are dangerous!
       Advanced users can set the session-level configuration spark.sql.crossJoin.enabled to true
       to allow cross joins without a warning and without Spark trying to perform another join for you. */
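    // A minimal sketch of that session-level switch (assumes Spark 2.x or later,
    // where the config key is spark.sql.crossJoin.enabled); left commented out
    // because the explicit "cross" join type below already opts in:
    // spark.conf.set("spark.sql.crossJoin.enabled", "true")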
    empDF.join(deptDF, joinExpression, "cross").show()
    spark.sql("SELECT * FROM emp CROSS JOIN dept ON emp.deptno = dept.deptno").show()
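    // NATURAL JOIN matches columns that share a name in both inputs; this line
    // assumes the graduateProgram and person temp views were registered elsewhere.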
|     spark.sql("SELECT * FROM graduateProgram NATURAL JOIN person").show() | ||||
|   } | ||||
| } | ||||
|   | ||||