This program builds a small DataFrame of three people, writes it out in the Avro, Parquet, and ORC formats, then reads each copy back and displays it. Together the three round trips show how Spark serializes data to the storage formats commonly used in the Hadoop ecosystem.
from pyspark.sql import SparkSession
# Avro support lives in the external spark-avro module, so the Avro read and
# write below fail unless the package is on the classpath. One way to pull it
# in is shown here; adjust the Scala/Spark versions to match your cluster.
spark = (SparkSession.builder
         .appName('SerializationExample')
         .config('spark.jars.packages', 'org.apache.spark:spark-avro_2.12:3.5.0')
         .getOrCreate())
# Create sample data
data = [(1, 'Alice', 29), (2, 'Bob', 31), (3, 'Cathy', 25)]
columns = ['id', 'name', 'age']
df = spark.createDataFrame(data, columns)
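# Passing a list of column names makes Spark infer the types from the data.
# For a stable on-disk schema, an explicit StructType is the usual
# alternative; a minimal sketch of the same table would be:
# from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# schema = StructType([
#     StructField('id', IntegerType(), False),
#     StructField('name', StringType(), True),
#     StructField('age', IntegerType(), True),
# ])
# df = spark.createDataFrame(data, schema)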
# Save data in Avro format
avro_path = '/tmp/data_avro'
df.write.format('avro').mode('overwrite').save(avro_path)
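# save() produces a directory of part files (plus a _SUCCESS marker),
# not a single .avro file; the reader below takes the directory path.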
# Read back Avro data
df_avro = spark.read.format('avro').load(avro_path)
print('Avro Data:')
df_avro.show()
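# Avro embeds the schema in the files, so it survives the round trip;
# printSchema() confirms the column names and types came back intact.
df_avro.printSchema()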
# Save data in Parquet format
parquet_path = '/tmp/data_parquet'
df.write.format('parquet').mode('overwrite').save(parquet_path)
# Read back Parquet data
df_parquet = spark.read.format('parquet').load(parquet_path)
print('Parquet Data:')
df_parquet.show()
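# Because Parquet is columnar, Spark can prune columns and push filters
# down to the files, so selective reads touch less data:
df_parquet.select('name').where('age > 26').show()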
# Save data in ORC format
orc_path = '/tmp/data_orc'
df.write.format('orc').mode('overwrite').save(orc_path)
# Read back ORC data
df_orc = spark.read.format('orc').load(orc_path)
print('ORC Data:')
df_orc.show()
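# ORC is also columnar and supports the same pruning and pushdown. A
# write-time compression codec can be set per write if needed, e.g.
# (the codec name here is illustrative; snappy is a common default):
# df.write.format('orc').option('compression', 'zlib').mode('overwrite').save(orc_path)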
spark.stop()
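To make the format comparison concrete, the bytes each format actually wrote can be totaled after the run. This is a minimal sketch assuming the three output paths above sit on the local filesystem rather than HDFS; dir_size is an illustrative helper, not part of the program above.
import os

def dir_size(path):
    # Sum the sizes of every file Spark wrote under the output directory,
    # including the individual part files.
    total = 0
    for root, _, files in os.walk(path):
        for name in files:
            total += os.path.getsize(os.path.join(root, name))
    return total

for label, path in [('Avro', '/tmp/data_avro'),
                    ('Parquet', '/tmp/data_parquet'),
                    ('ORC', '/tmp/data_orc')]:
    print(f'{label}: {dir_size(path)} bytes')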