"""Demonstrate null and duplicate handling on a small PySpark DataFrame.

This program creates a small table with some missing ages and repeated rows.
It shows how to find missing ages, detect duplicates, and remove duplicates.
"""
from pyspark.sql import SparkSession
# Obtain the Spark session used by every step of the demo.
spark = SparkSession.builder.appName('NullDuplicateDemo').getOrCreate()

# Run the demo under try/finally so the session is always stopped, even if
# DataFrame creation or one of the show() actions raises.
try:
    # Sample data with nulls and duplicates:
    #   - ids 2 and 4 have no age,
    #   - id 5 has no name,
    #   - the (1, 'Alice', 25) row appears twice.
    data = [
        (1, 'Alice', 25),
        (2, 'Bob', None),
        (3, 'Charlie', 30),
        (4, 'Bob', None),
        (5, None, 22),
        (1, 'Alice', 25),
    ]
    columns = ['id', 'name', 'age']
    df = spark.createDataFrame(data, columns)

    print('Original DataFrame:')
    df.show()

    print('Rows with nulls in age column:')
    df.filter(df['age'].isNull()).show()

    # Group on every column so that only fully identical rows land in the
    # same group; a group with count > 1 is a duplicated row. The two 'Bob'
    # rows differ by id, so they are not reported here.
    print('Duplicate rows:')
    df.groupBy(df.columns).count().filter('count > 1').show()

    # With no column subset, dropDuplicates() compares entire rows.
    print('DataFrame after dropping duplicates:')
    df.dropDuplicates().show()
finally:
    spark.stop()