Split string column in Spark

The split function in Spark splits a string column into an array column; each item of the array can then be extracted to form a new column.

from pyspark.sql import SparkSession

from pyspark.sql import functions as F

# Obtain the active SparkSession, creating one on first use.
spark = SparkSession.builder.getOrCreate()

# Sample rows: (firstname, midname, lastname, dob) tuples; dob is an
# ISO-style 'YYYY-MM-DD' string that the examples below split on '-'.
data = [
    ('James', '', 'Smith', '1991-04-01'),
    ('Michael', 'Rose', '', '2000-05-19'),
    ('Robert', '', 'Williams', '1978-09-05'),
    ('Maria', 'Anne', 'Jones', '1967-12-01'),
    ('Jen', 'Mary', 'Brown', '1980-02-17'),
]

# Build the DataFrame from the tuples using a DDL-style schema string.
df = spark.createDataFrame(data, schema='firstname String, midname String, lastname string, dob string')


# Naive approach: the same split expression is written out once per
# derived column, extracting items 0, 1 and 2 of the '-'-separated dob.
df1 = (
    df.withColumn('year', F.split(df['dob'], '-').getItem(0))
      .withColumn('month', F.split(df['dob'], '-').getItem(1))
      .withColumn('day', F.split(df['dob'], '-').getItem(2))
)

df1.show(truncate=False)


# Tidier alternative: build the split expression once, bind it to a
# name, and reuse that single column expression for every derived column.
split_record = F.split(df['dob'], '-')

df2 = (
    df.withColumn('year', split_record.getItem(0))
      .withColumn('month', split_record.getItem(1))
      .withColumn('day', split_record.getItem(2))
)

df2.show(truncate=False)