import re  # Regular-expression support

# Compile the phone-number pattern once (format xxx-xxx-xxxx).
# Raw string (r'...') avoids the invalid-escape SyntaxWarning that
# '\d' in a plain string triggers on Python 3.12+; the pattern value
# is unchanged.
prog = re.compile(r'\d{3}-\d{3}-\d{4}')
result = prog.match('123-456-7890')  # None when the string does not match
print(bool(result))
import re  # Regular-expression support

# Extract every run of digits from the sentence.
# Raw string avoids the '\d' invalid-escape warning on modern Python.
matches = re.findall(r'\d+', 'the recipe calls for 10 strawberries and 1 banana')
print(matches)
# Three sample matches, each reduced to a boolean.
# All patterns use raw strings so backslash escapes reach the re engine
# untouched (and no SyntaxWarning on Python 3.12+).
pattern1 = bool(re.match(pattern=r'\d{3}-\d{3}-\d{4}', string='123-456-7890'))  # phone number
print(pattern1)
pattern2 = bool(re.match(pattern=r'\$\d*\.\d{2}', string='$123.45'))  # dollar amount
print(pattern2)
# NOTE: \w* also matches the empty string, so this is True for ANY input.
pattern3 = bool(re.match(pattern=r'\w*', string='Australia'))
print(pattern3)
#1. Importing the required packages
import matplotlib.pyplot as plt
import seaborn as sns
import warnings #To hide Warning messages
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn import preprocessing
import re
# 2. Reading Data
# Load the raw CSV export into a DataFrame and show the first rows
# as a quick sanity check.
df = pd.read_csv("dataset.csv")
print(df.head())
# 3. Data Preprocessing
# 3.1 Handling NULL or Duplicate Values
# How many missing values does each column carry?
print(df.isna().sum())
# Discard any row containing a missing value, then any exact duplicate row.
df = df.dropna()
df = df.drop_duplicates()
# Report the dimensions of the cleaned dataset.
print(df.shape)
# Summary statistics for the Rating column.
# Each entry pairs the label with its statistic; output is identical to
# printing them one by one.
rating_summary = [
    ("Rating mean:", df.Rating.mean()),
    ("Rating median:", df.Rating.median()),
    ("Rating mode:", df.Rating.mode()),
    ("Rating min:", df.Rating.min()),
    ("Rating max:", df.Rating.max()),
]
for label, value in rating_summary:
    print(label, value)
# 3.2 Handling Data Types of each Feature
# Reviews holds counts, so it must be an integer column.
print(df.dtypes)  # dtypes before the cast
df['Reviews'] = df['Reviews'].astype(int)
print(df.dtypes)  # dtypes after: Reviews is now int64
# Changing the Feature : Installs (Example : '10,000+' to 10000.0)
# Vectorized version of the original element-wise loop: strip the
# trailing '+' and the thousands separators, then cast to float.
# regex=False keeps '+' a literal character rather than a regex token.
df.Installs = (
    df.Installs.str.replace("+", "", regex=False)
               .str.replace(",", "", regex=False)
               .astype(float)
)
df.Installs.head()
# Changing the feature : Size (Change '15M' to 15.0)
newSize = []
for row in df.Size:
    stripped = row.replace("M", "")  # drop the megabyte suffix
    try:
        newSize.append(float(stripped))
    except ValueError:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit). Non-numeric entries such as
        # "Varies with device" (and any 'k'-suffixed sizes) become 0,
        # exactly as before.
        newSize.append(0)
df.Size = newSize  # write the cleaned values back
print(df.Size.head())
# Changing the feature : Price (remove '$' and convert to float)
# The original branched on `row != 0`, but the string "0" (the usual
# free-app value after CSV read) already converts cleanly once '$' is
# stripped. str() makes the same expression safe for a literal numeric 0
# as well, so one conversion covers both branches.
df.Price = [float(str(row).replace("$", "")) for row in df.Price]
print(df.Price.head())
# Changing the feature: Android Ver ("4.0.3 and up" -> 4.0)
newVer = []
for row in df['Android Ver']:
    # Keep the leading "major.minor" slice unless the value varies per device.
    candidate = row if row == "Varies with device" else row[0:3]
    try:
        candidate = float(candidate)
    except ValueError:
        # Narrowed from a bare `except:`; non-numeric entries
        # (e.g. "Varies with device") map to 0 as before.
        candidate = 0
    newVer.append(candidate)
df['Android Ver'] = newVer
# The original computed value_counts() and discarded the result (a no-op);
# print it, as the accompanying comment intended.
print(df['Android Ver'].value_counts())  # total count for each Android Ver
print(df['Android Ver'])
# 4. Analyzing Features
# 4.1 Categories — how many apps fall into each category?
print(df.Category.value_counts())
# Mean rating broken down by (Type, Category).
info = df.groupby(["Type", "Category"])["Rating"].mean()
print(info)
# 4.2 Price
# Show the most costly app(s) in the PlayStore: filter the rows whose
# price equals the column maximum (keeps all ties).
top_price = df["Price"].max()
print(df[df["Price"] == top_price])
# 5. Further Analysis
# All apps holding a perfect 5.0 rating.
df_full = df.loc[df['Rating'] == 5.0]
df_full.head()
# Distribution of review counts among the 5.0-rated apps.
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True) is the documented replacement.
sns.histplot(df_full.Reviews, kde=True)
plt.show()
# Skew for the 5.0-rated apps that have exactly 5 reviews.
# NOTE(review): axis=1 computes a per-row skew across numeric columns,
# not the skew of the Reviews column itself — confirm this is the intent
# (df_full["Reviews"].skew() would give the column skew).
skewValue = df_full[df_full["Reviews"] == 5].skew(axis=1)
print(skewValue)
# Category and Reviews:
# average review count per unique Category, sorted descending.
#
# BUG FIX: the original looped over every category, recomputed the same
# groupby each iteration, and then zipped `cat_list` (categories in order
# of appearance) against `review.values` (groupby output, alphabetically
# ordered) — misaligning category names with their mean review counts.
# A single groupby keeps names and values paired correctly.
review_by_cat = df.groupby('Category')['Reviews'].mean()
data_cat_rev = pd.DataFrame({
    "category": review_by_cat.index,
    "review": review_by_cat.values,
})
data_cat_rev = data_cat_rev.sort_values(["review"], ascending=False)
print(data_cat_rev)
# 6. Export the file to CSV
#df.to_csv("try.csv")