from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta
def preprocess():
    """Placeholder data-preprocessing step; currently only logs that it ran."""
    step_message = 'Data preprocessing step'
    print(step_message)
def train_model():
    """Placeholder model-training step; currently only logs that it ran."""
    step_message = 'Model training step'
    print(step_message)
def evaluate():
    """Placeholder model-evaluation step; currently only logs that it ran."""
    step_message = 'Model evaluation step'
    print(step_message)
def notify():
    """Placeholder completion-notification step; currently only logs that it ran."""
    step_message = 'Notify completion'
    print(step_message)
# Defaults applied to every task in this DAG unless overridden per operator.
default_args = dict(
    owner='airflow',                    # task owner shown in the Airflow UI
    depends_on_past=False,              # each run is independent of the previous run's outcome
    start_date=datetime(2024, 1, 1),    # first schedulable date for the DAG
    retries=1,                          # retry a failed task once
    retry_delay=timedelta(minutes=5),   # wait 5 minutes before that retry
)
# Daily pipeline; catchup=False skips backfilling runs between start_date and now.
# NOTE(review): `schedule_interval` was renamed `schedule` in Airflow 2.4+ —
# confirm the deployed Airflow version before migrating this keyword.
dag = DAG(
    dag_id='ml_pipeline',
    description='Simple ML pipeline',
    default_args=default_args,
    schedule_interval=timedelta(days=1),
    catchup=False,
)
# One operator per pipeline step, built from a (task_id, callable) table so the
# four definitions stay visibly parallel. The module-level task names are kept
# so external references (tests, docs) continue to work.
_step_table = [
    ('preprocess', preprocess),
    ('train_model', train_model),
    ('evaluate', evaluate),
    ('notify', notify),
]
preprocess_task, train_task, evaluate_task, notify_task = [
    PythonOperator(task_id=step_id, python_callable=step_fn, dag=dag)
    for step_id, step_fn in _step_table
]

# Linear ordering: each step starts only after the previous one succeeds.
preprocess_task >> train_task >> evaluate_task >> notify_task
# This file defines a Directed Acyclic Graph (DAG) named 'ml_pipeline' that runs daily starting from January 1, 2024.
# Each PythonOperator runs a Python function representing a step: preprocess, train_model, evaluate, and notify.
# The tasks are linked in order so they run one after another.
# Retries are set to 1 with a 5-minute delay if a task fails.