// Read people.csv without setting any reader options.

val peopledf = spark.read.csv("file:///home/geoinsys/spark-2.4.5-bin-hadoop2.7/examples/src/main/resources/people.csv")

// View the schema of the DataFrame.

peopledf.printSchema()

// Read the same file with an explicit field delimiter.

val peopledfdelimited = spark.read.option("sep", ",").csv("file:///home/geoinsys/spark-2.4.5-bin-hadoop2.7/examples/src/main/resources/people.csv")

// Read the CSV file with a header row, schema inference and an explicit delimiter.

val dfschema = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .option("sep", ",")
  .csv("file:///home/geoinsys/spark-2.4.5-bin-hadoop2.7/examples/src/main/resources/people.csv")

// ---------- Handling superstore.csv ----------

// Download the superstore.csv file from the Datasets drive folder first.

val salesDF = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .option("sep", ",")
  .csv("file:///home/geoinsys/Downloads/datasets/superstore.csv")

// View the schema.

salesDF.printSchema()

// Print the data to the console; show() displays only the first 20 rows by default.

salesDF.show()

// View the row count.

salesDF.count()

// Number of partitions backing the DataFrame.

salesDF.rdd.getNumPartitions

// ---------- Basic operations on the DataFrame ----------

// Register salesDF as a temporary view so it can be queried with Spark SQL.
// The view name must match the table name used in the query below.

salesDF.createOrReplaceTempView("Sales")

// Spark SQL over the Sales temporary view: total profit per Country and Category.

spark.sql("select Country,Category,sum(Profit) from Sales group by Country,Category").show()

// spark-shell normally has the SQL functions in scope already; the explicit
// import keeps this snippet usable outside the shell as well.
import org.apache.spark.sql.functions.sum

// Cast the Profit column to Long while keeping the grouping columns.

val totsales = salesDF.selectExpr("Region", "Country", "State", "Category", "cast(Profit as Long)as Profit")

// US sales data only: summed Profit per Country/Category, exposed as column "Sales".
// No orderBy before the aggregation: after the filter every row has the same
// Country, and groupBy/agg does not preserve input ordering anyway.

val sales_US = totsales
  .filter("Country='United States'")
  .groupBy("Country", "Category")
  .agg(sum("Profit").as("Sales"))


// Read data from a JSON file.

val dfjson = spark.read.option("InferSchema", "true").json("file:///home/geoinsys/spark-2.4.5-bin-hadoop2.7/examples/src/main/resources/people.json")

 //reading data from json file (nested)

// Sample file nested.json (one JSON object per line):
//   {"col1":{"col2":"val2","col3":["arr1","arr2"]}}

// NOTE(review): the location of nested.json is not given in this script;
// a relative path is assumed here - adjust it to where the file actually lives.
val dfjsonnested = spark.read.json("nested.json")

// Register the nested data as a temporary view so it can be queried by name.
dfjsonnested.createOrReplaceTempView("jsonnested")

// Select the first element of the col3 array nested inside the col1 struct.
spark.sql("select col1.col3[0] from jsonnested").show()

// Read data from an Avro file.
// NOTE(review): the "avro" format comes from the external spark-avro module;
// confirm the shell was started with that package on the classpath.

val avrodf = spark.read.format("avro").load("people.avro")

// Read Avro data using a schema supplied from an Avro schema (.avsc) file.

import java.io.File
import org.apache.avro.Schema

// Parse the schema file into an Avro Schema object.
val schemaAvro = new Schema.Parser().parse(new File("people.avsc"))

// Pass the schema to the Avro reader as JSON text via the avroSchema option.
// NOTE(review): the original snippet never named an input file; people.avro
// (the file loaded in the plain Avro example) is assumed here - confirm.
// This re-binds the name peopledf, which was used earlier for the CSV DataFrame.
val peopledf = spark.read
  .format("avro")
  .option("avroSchema", schemaAvro.toString)
  .load("people.avro")


// To save the DataFrame as files in HDFS, e.g. salesDF.write.csv("hdfs://<namenode>/<output-dir>")


// To save the DataFrame as a table, e.g. salesDF.write.saveAsTable("<table_name>")



