"""Demonstrate PySpark column selection and row filtering.

Builds a small table of people with their age and salary, then shows how
to pick columns and filter rows using ``select``, ``filter``, and ``where``.
"""
from pyspark.sql import SparkSession
# Initialize the Spark session that drives all DataFrame operations.
spark = SparkSession.builder.appName("SelectFilterWhereExample").getOrCreate()

# Sample rows: (name, age, salary) for four people.
people_rows = [
    ("Alice", 30, 60000),
    ("Bob", 22, 40000),
    ("Charlie", 35, 70000),
    ("David", 28, 45000),
]
# Column labels matching the tuple positions above.
people_columns = ["name", "age", "salary"]

# Build the DataFrame from the in-memory rows.
people_df = spark.createDataFrame(people_rows, people_columns)

# select: keep only the name and salary columns.
names_and_salaries = people_df.select("name", "salary")
# filter: keep rows whose age exceeds 25 (Column-expression form).
older_than_25 = people_df.filter(people_df.age > 25)
# where: keep rows whose salary exceeds 50000 (SQL-string form).
high_earners = people_df.where("salary > 50000")

# Display each result set.
print("Selected columns (name, salary):")
names_and_salaries.show()
print("Filtered rows (age > 25):")
older_than_25.show()
print("Where rows (salary > 50000):")
high_earners.show()

# Release cluster resources when done.
spark.stop()