This program creates a DataFrame whose ages are stored as strings, including some nulls and the literal string 'NaN'. It replaces each 'NaN' string with null, then casts age to integer in a new column, age_int, replacing nulls with 0.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
spark = SparkSession.builder.appName('TypeCastingNullHandling').getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# stopped even when one of the steps below raises.
try:
    # Sample data: ages are strings; some are missing (None) and one is the
    # literal string 'NaN' rather than a real null.
    data = [
        ('Alice', '23'),
        ('Bob', None),
        ('Charlie', '35'),
        ('David', 'NaN'),
        ('Eve', None),
    ]
    columns = ['name', 'age']
    df = spark.createDataFrame(data, columns)

    # Show original data
    print('Original Data:')
    df.show()

    # Replace the 'NaN' placeholder string with a real null so that it is
    # handled the same way as the rows that were missing from the start.
    df = df.replace('NaN', None, subset=['age'])

    # Cast first, then fill: filling after the cast also covers any null the
    # cast itself produces (e.g. an unparsable string when ANSI mode is off),
    # not only the nulls that were already present in 'age'.
    df = df.withColumn('age_int', col('age').cast('int')).fillna(0, subset=['age_int'])

    print('After type casting and null handling:')
    df.show()
finally:
    spark.stop()