本文主要讲解 Spark 中 DataFrame 和 Spark SQL 的综合使用,以 join 操作为例。示例代码分别使用 Java 和 Scala 语言编写。
Java 版本
package com.dt.sparkql.java; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import scala.Tuple2; import java.util.ArrayList; import java.util.List; public class SparkSqlWithJoin { public static void main(String[] args) { /** * 创建spark配置对象,设置spark程序的运行时配置信息,例如通过setMaster设置集群