"""Build a small example dataset containing missing values, then drop any
row whose 'age' or 'salary' is missing, so that later steps using those
columns do not fail on bad records."""
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan
# Start (or reuse) a local Spark session for this example.
spark = SparkSession.builder.appName('DataQualityExample').getOrCreate()

# Sample data with some bad rows; Python None models a missing value and
# becomes SQL NULL once loaded into a Spark DataFrame.
data = [
    (1, 'Alice', 25, 50000),
    (2, 'Bob', None, 60000),
    (3, 'Charlie', 30, None),
    (4, None, 22, 45000),
    (5, 'Eve', 28, 70000),
]
columns = ['id', 'name', 'age', 'salary']
raw_data = spark.createDataFrame(data, columns)

# Show raw data before cleaning.
print('Raw data:')
raw_data.show()

# Clean data: drop rows where 'age' or 'salary' is missing.
# NOTE: None maps to SQL NULL, not NaN, and the inferred column types here
# are long, for which isnan() is always false — so explicit isnan() checks
# add nothing. dropna(subset=...) removes both NULL and (for float columns)
# NaN values, which states the intent directly.
clean_data = raw_data.dropna(subset=['age', 'salary'])

print('Clean data:')
clean_data.show()

# Release cluster resources held by this session.
spark.stop()