This script builds a small PySpark DataFrame and evaluates three data-quality rules as boolean checks (no assert statements are raised), printing True or False for each rule depending on whether it passes.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Local Spark session used solely for these data-quality checks.
spark = SparkSession.builder.appName('DataQualityAssertions').getOrCreate()

# Hand-crafted sample rows: (id, name, age, email, salary).
# Bob is missing an email, Charlie has a negative age, and David's
# salary is below 30000 — each row exercises one of the rules below.
sample_rows = [
    (1, 'Alice', 25, 'alice@example.com', 50000),
    (2, 'Bob', 30, None, 60000),
    (3, 'Charlie', -5, 'charlie@example.com', 70000),
    (4, 'David', 40, 'david@example.com', 25000),
]
df = spark.createDataFrame(sample_rows, ['id', 'name', 'age', 'email', 'salary'])

# Rule 1: no row may carry a negative age.
ages_ok = df.where(col('age') < 0).count() == 0

# Rule 2: every row must have an email address.
emails_ok = df.where(col('email').isNull()).count() == 0

# Rule 3: at most one salary may fall below 30000.
salaries_ok = df.where(col('salary') < 30000).count() <= 1

print(f'No negative ages: {ages_ok}')
print(f'No missing emails: {emails_ok}')
print(f'At most 1 low salary: {salaries_ok}')
spark.stop()