Split string column in Spark
Spark's split function turns a string column into an array column; each item of that array can then be extracted to form a new column.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
spark = SparkSession.builder.getOrCreate()

# Demo rows: (first name, middle name, last name, date of birth as a string).
rows = [
    ('James', '', 'Smith', '1991-04-01'),
    ('Michael', 'Rose', '', '2000-05-19'),
    ('Robert', '', 'Williams', '1978-09-05'),
    ('Maria', 'Anne', 'Jones', '1967-12-01'),
    ('Jen', 'Mary', 'Brown', '1980-02-17'),
]
people = spark.createDataFrame(
    rows,
    schema='firstname String, midname String, lastname string, dob string',
)

# Variant 1: repeat the split expression once per derived column,
# extracting items 0, 1 and 2 of the resulting array.
repeated = people
for position, column in enumerate(('year', 'month', 'day')):
    repeated = repeated.withColumn(
        column, F.split(people['dob'], '-').getItem(position)
    )
repeated.show(truncate=False)

# Variant 2: build the split expression once, keep it in a local name,
# and index into it for each derived column — avoids repeating the split.
split_record = F.split(people['dob'], '-')
shared = people
for position, column in enumerate(('year', 'month', 'day')):
    shared = shared.withColumn(column, split_record.getItem(position))
shared.show(truncate=False)