This code shows a simple Lambda architecture example using Spark on Hadoop: the batch layer processes historical data once, while the speed layer processes streaming data continuously. Both sets of results are printed to the console.
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col
# Initialize the Spark session shared by both layers.
spark = SparkSession.builder.appName('LambdaExample').getOrCreate()

# Batch Layer: read the historical (master) dataset from HDFS.
batch_data = spark.read.json('hdfs://path/to/historical/data')

# Process batch data: total event count per event type.
# NOTE(review): an earlier comment said "per day", but no date/window column
# is involved here — the grouping is by eventType only.
batch_counts = batch_data.groupBy('eventType').count()

# Speed Layer: read new records as a stream. Streaming JSON sources require
# an explicit schema, so reuse the one inferred from the batch data.
streaming_data = (
    spark.readStream
    .schema(batch_data.schema)
    .json('hdfs://path/to/streaming/data')
)

# Process streaming data: event counts in 1-minute tumbling windows.
# Assumes the records carry a 'timestamp' column — TODO confirm against
# the actual data schema.
stream_counts = streaming_data.groupBy(
    window(col('timestamp'), '1 minute'), 'eventType'
).count()

# Serving (demo): emit streaming aggregates to the console. 'complete'
# output mode re-emits the full aggregate table on every trigger.
query = stream_counts.writeStream.outputMode('complete').format('console').start()

# Show batch-layer results while the stream runs in the background.
batch_counts.show()

# In production the stream runs indefinitely; for this demo, wait up to
# 10 seconds, then stop the query explicitly before shutting Spark down
# so the streaming job terminates cleanly instead of being torn down
# mid-trigger by spark.stop().
query.awaitTermination(10)
query.stop()
spark.stop()