Create Date Dimension
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd
spark = SparkSession.builder.getOrCreate()
fin_year_start_month = 7 #financial year starts from July
#create a range of dates
dates = [[int(t.strftime('%Y%m%d')), t.strftime('%Y-%m-%d')] for t in pd.date_range(start_date, end_date, freq='D').to_list()]
df = spark.createDataFrame(dates, schema=['ID', 'Date'])
#calculate the other date dimension attributes
df = (
df.withColumn('MonthStartDate', F.to_date(F.date_trunc('month', 'Date')))
.withColumn('MonthEndDate', F.last_day('Date'))
.withColumn('WeekStartDate', F.to_date(F.date_trunc('week', 'Date'))) #Starts from Sunday
.withColumn('WeekEndDate', F.date_add('WeekStartDate', 6))
.withColumn('CalendarYear', F.year('Date'))
.withColumn('CalendarYearQuarter', F.concat(F.year('Date'), F.lit('-'), F.date_format('Date', 'QQQ')))
.withColumn('CalendarQuarter', F.date_format('Date', 'QQQ'))
.withColumn('CalendarQuarterNo', F.quarter('Date'))
.withColumn('CalendarYearMonth', F.concat(F.year('Date'),F.lit('-'), F.month('Date')))
.withColumn('CalendarYearMonthNo', F.month('Date'))
.withColumn('MonthNameFull', F.date_format('Date', 'MMMM'))
.withColumn('MonthNameShort', F.date_format('Date', 'MMM'))
.withColumn('CalendarYearWeek', F.concat(F.year('Date'),F.lit('-'), F.weekofyear('Date')))
.withColumn('CalendarYearWeekNo', F.weekofyear('Date'))
.withColumn('CalendarMonthDayNo', F.dayofmonth('Date'))
.withColumn('WeekDayNo', F.dayofweek('Date'))
.withColumn('DayNameFull', F.date_format('Date', 'EEEE'))
.withColumn('DayNameShort', F.date_format('Date', 'E'))
.withColumn('FinancialYear', F.when(F.month('Date') >= fin_year_start_month, F.year('Date')+1).otherwise(F.year('Date')))
.withColumn('FinancialQuarter', F.date_format(F.add_months('Date', -6), 'QQQ'))
.withColumn('FinancialQuarterNo', F.quarter(F.add_months('Date', -6)))
.withColumn('FinancialYearMonthNo', F.month(F.add_months('Date', -6)))
.withColumn('FinancialYearMonth', F.concat('FinancialYear',F.lit('-'), 'FinancialYearMonthNo'))
.withColumn('FinancialYearStartDate', F.concat(F.col('FinancialYear')-1,F.lit('-08-01')))
.withColumn('FinancialYearWeekNo', F.weekofyear(F.add_months('Date', -6)))
.withColumn('FinancialYearWeek', F.concat('FinancialYear', F.lit('-'), 'FinancialYearWeekNo'))
.withColumn('FinancialYearQuarter',F.concat('FinancialYear', F.lit('-'), 'FinancialQuarterNo'))
.withColumn('DaysPerMonth', F.date_format(F.last_day('Date'), 'd'))
.withColumn('IsWeekDay', F.when((F.col('WeekDayNo') > 1) & (F.col('WeekDayNo') < 7), True).otherwise(False))
.withColumn('YYYYMMDD', F.col('ID'))
)
df.show(50)