This program creates a small dataset, saves it as a Delta Lake table, updates the rows that match a condition in place, and reads the table back to show the result.
from pyspark.sql import SparkSession
from delta.tables import DeltaTable
spark = SparkSession.builder.appName("DeltaLakeExample") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()
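# Note: this assumes the Delta Lake jars are already on the classpath (e.g.
# supplied via spark-submit --packages). With a pip install of delta-spark,
# the documented pattern is to keep the configured builder in a variable and
# wrap it before calling getOrCreate():
#   from delta import configure_spark_with_delta_pip
#   spark = configure_spark_with_delta_pip(builder).getOrCreate()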
# Create sample data
data = [(1, "click", 10), (2, "view", 20), (3, "click", 5)]
columns = ["id", "eventType", "count"]
df = spark.createDataFrame(data, columns)
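# Spark infers the schema from the Python values here: id and count become
# bigint columns, eventType becomes string.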
# Save as Delta Lake table
path = "/tmp/delta/events"
df.write.format("delta").mode("overwrite").save(path)
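# On disk this produces Parquet data files plus a _delta_log/ directory
# holding the JSON transaction log that makes the directory a Delta table.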
deltaTable = DeltaTable.forPath(spark, path)
# Increment count by 1 on every row where eventType is 'click'
deltaTable.update(condition="eventType = 'click'", set={"count": "count + 1"})
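# Equivalently, update() also accepts Column expressions in place of the
# SQL strings used above:
#   from pyspark.sql.functions import expr
#   deltaTable.update(condition=expr("eventType = 'click'"),
#                     set={"count": expr("count + 1")})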
# Show updated data
updated_df = spark.read.format("delta").load(path)
updated_df.show()
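If everything runs as expected, show() should print something like the following (row order is not guaranteed): each 'click' row has its count incremented by one, while the 'view' row is untouched.
+---+---------+-----+
| id|eventType|count|
+---+---------+-----+
|  1|    click|   11|
|  2|     view|   20|
|  3|    click|    6|
+---+---------+-----+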