// Untitled

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.types.{StructField, StructType, StringType, DoubleType, IntegerType}
import org.apache.spark.sql.functions.col

// Location of the abalone dataset on the local filesystem.
// Edit this value to point at your own copy of the file.
val PATH: String = "/home/usuario/Descargas/abalone.data"

// Spark entry point used by the rest of the script. It runs with the "local"
// master, and getOrCreate() returns the already-active session when there is one.
val spark: SparkSession = {
    val sessionBuilder = SparkSession.builder()
    sessionBuilder.
        appName("Evaluación 1 Parte 2 - Manejo de RDDs y Dataframes en Scala").
        master("local").
        getOrCreate()
}

// Explicit schema for the abalone file, listed in file column order.
// Every field is declared nullable.
val abaloneSchema = StructType(
    Seq(
        "Sex" -> StringType,
        "Length" -> DoubleType,
        "Diameter" -> DoubleType,
        "Height" -> DoubleType,
        "Whole_weight" -> DoubleType,
        "Shucked_weight" -> DoubleType,
        "Viscera_weight" -> DoubleType,
        "Shell_weight" -> DoubleType,
        "Rings" -> IntegerType
    ).map { case (fieldName, fieldType) =>
        StructField(fieldName, fieldType, nullable = true)
    }
)

// Read the comma-delimited file at PATH as CSV, applying abaloneSchema instead
// of letting Spark infer the column types.
val abalone_df: DataFrame = spark.read.
    schema(abaloneSchema).
    option("delimiter", ",").
    csv(PATH)

/**
 * Prints basic record-level information about a DataFrame: the total number
 * of records, the number of records in which every column is null, and the
 * number of distinct records.
 *
 * Each figure is obtained with its own `count()` action and written to
 * standard output.
 *
 * @param df the DataFrame to describe; the null check is derived from
 *           `df.columns`, so it is not tied to any particular schema
 */
def informacionBasica(df: DataFrame): Unit = {
    val cnt = df.count()

    // A record is considered null only when every one of its cells is null.
    // The predicate is built by mapping each column to `isNull` and AND-ing
    // the results together. Filtering on it directly avoids adding a
    // temporary column (which could clash with an existing column name) and
    // does not need the `$"..."` syntax from `spark.implicits._`.
    // `reduceOption` yields None for a DataFrame without columns; there are
    // then no cells to test, so 0 null records are reported.
    val nullCnt = df.columns.
        map(columnName => col(columnName).isNull).
        reduceOption(_ && _).
        map(allCellsNull => df.filter(allCellsNull).count()).
        getOrElse(0L)

    val distinctCnt = df.distinct().count()

    println(s"Número de registros: $cnt")
    println(s"Número de registros nulos: $nullCnt")
    println(s"Número de registros distintos: $distinctCnt")
}

// Record-level overview: total, fully-null and distinct record counts.
println("Información básica")
informacionBasica(abalone_df)

// Numeric summaries: one titled table for the three real-valued fields and
// one for the integer field, both with the same set of statistics.
Seq(
    "Estadísticas básicas de tres campos tipo Real" -> Seq("Height", "Shell_weight", "Diameter"),
    "Estadísticas básicas del campo tipo Int" -> Seq("Rings")
).foreach { case (titulo, columnas) =>
    println(titulo)
    abalone_df.
        select(columnas.map(col): _*).
        summary("count", "count_distinct", "mean", "stddev", "max", "min").
        show()
}

// Categorical field: register the DataFrame as the temp view "abalone" and
// count the records per value of Sex with a SQL query.
println("Características básicas campo categórico")
abalone_df.createOrReplaceTempView("abalone")

val recuentoPorSexo = spark.sql("SELECT Sex, COUNT(*) AS `Número de registros` FROM abalone GROUP BY Sex")
recuentoPorSexo.show()