罗祥
2019-05-21 17:36:54 +08:00
parent 0dd9d8862f
commit 38c890d5e3
6 changed files with 231 additions and 59 deletions


@@ -10,45 +10,47 @@ object SparkSqlApp {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName("aggregations").master("local[2]").getOrCreate()
val empDF = spark.read.json("/usr/file/json/emp.json")
empDF.createOrReplaceTempView("emp")
empDF.show()
empDF.select(count("ename")).show()
empDF.select(countDistinct("deptno")).show()
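// 0.1 is the maximum estimation error (relative standard deviation) allowed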
empDF.select(approx_count_distinct("ename", 0.1)).show()
empDF.select(first("ename"), last("job")).show()
empDF.select(min("sal"), max("sal")).show()
empDF.select(sum("sal")).show()
empDF.select(sumDistinct("sal")).show()
empDF.select(avg("sal")).show()
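// The same aggregations expressed in Spark SQL, for symmetry with the join examples below (a minimal sketch using the "emp" view registered above):
spark.sql("SELECT count(ename), count(DISTINCT deptno), min(sal), max(sal), sum(sal), avg(sal) FROM emp").show()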
val deptDF = spark.read.json("/usr/file/json/dept.json")
deptDF.createOrReplaceTempView("dept")
deptDF.printSchema()
// 1. Define the join expression
val joinExpression = empDF.col("deptno") === deptDF.col("deptno")
// 2. Perform the join
empDF.join(deptDF, joinExpression).select("ename", "dname").show()
spark.sql("SELECT ename,dname FROM emp JOIN dept ON emp.deptno = dept.deptno").show()
// Population variance, sample variance, population standard deviation, sample standard deviation
empDF.select(var_pop("sal"), var_samp("sal"), stddev_pop("sal"), stddev_samp("sal")).show()
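// Equivalent Spark SQL for the variance and standard deviation functions (a sketch):
spark.sql("SELECT var_pop(sal), var_samp(sal), stddev_pop(sal), stddev_samp(sal) FROM emp").show()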
empDF.join(deptDF, joinExpression, "outer").show()
spark.sql("SELECT * FROM emp FULL OUTER JOIN dept ON emp.deptno = dept.deptno").show()
empDF.join(deptDF, joinExpression, "left_outer").show()
spark.sql("SELECT * FROM emp LEFT OUTER JOIN dept ON emp.deptno = dept.deptno").show()
// Skewness and kurtosis
empDF.select(skewness("sal"), kurtosis("sal")).show()
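// Equivalent Spark SQL (a sketch):
spark.sql("SELECT skewness(sal), kurtosis(sal) FROM emp").show()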
empDF.join(deptDF, joinExpression, "right_outer").show()
spark.sql("SELECT * FROM emp RIGHT OUTER JOIN dept ON emp.deptno = dept.deptno").show()
// Pearson correlation coefficient, sample covariance, and population covariance of two columns
empDF.select(corr("empno", "sal"), covar_samp("empno", "sal"),
covar_pop("empno", "sal")).show()
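// Equivalent Spark SQL (a sketch):
spark.sql("SELECT corr(empno, sal), covar_samp(empno, sal), covar_pop(empno, sal) FROM emp").show()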
empDF.join(deptDF, joinExpression, "left_semi").show()
spark.sql("SELECT * FROM emp LEFT SEMI JOIN dept ON emp.deptno = dept.deptno").show()
empDF.agg(collect_set("job"), collect_list("ename")).show()
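// Equivalent Spark SQL (a sketch):
spark.sql("SELECT collect_set(job), collect_list(ename) FROM emp").show()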
empDF.join(deptDF, joinExpression, "left_anti").show()
spark.sql("SELECT * FROM emp LEFT ANTI dept ON emp.deptno = dept.deptno").show()
empDF.groupBy("deptno", "job").count().show()
spark.sql("SELECT deptno, job, count(*) FROM emp GROUP BY deptno, job").show()
empDF.groupBy("deptno").agg(count("ename").alias("人数"), sum("sal").alias("总工资")).show()
spark.sql("SELECT deptno, count(ename) ,sum(sal) FROM emp GROUP BY deptno").show()
empDF.groupBy("deptno").agg("ename"->"count","sal"->"sum").show()
/* You should use cross joins only if you are absolutely, 100% certain that this is the join you need.
There is a reason you must be explicit when defining a cross join in Spark: they are dangerous!
Advanced users can set the session-level configuration spark.sql.crossJoin.enabled to true to allow
cross joins without warnings and without Spark trying to perform another join for you. */
empDF.join(deptDF, joinExpression, "cross").show()
spark.sql("SELECT * FROM emp CROSS JOIN dept ON emp.deptno = dept.deptno").show()
spark.sql("SELECT * FROM graduateProgram NATURAL JOIN person").show()
}
}