import re  # Regular-expression support

# Compile the phone-number pattern once (format xxx-xxx-xxxx).
# Raw string (r'...') avoids the invalid-escape SyntaxWarning that
# '\d' in a plain string triggers on Python 3.12+; the pattern value
# is unchanged.
prog = re.compile(r'\d{3}-\d{3}-\d{4}')
result = prog.match('123-456-7890')  # None when the string does not match
print(bool(result))
import re  # Regular-expression support

# Extract every run of digits from the sentence.
# Raw string avoids the '\d' invalid-escape warning on modern Python.
matches = re.findall(r'\d+', 'the recipe calls for 10 strawberries and 1 banana')
print(matches)
# Three sample matches, each reduced to a boolean.
# All patterns use raw strings so backslash escapes reach the re engine
# untouched (and no SyntaxWarning on Python 3.12+).
pattern1 = bool(re.match(pattern=r'\d{3}-\d{3}-\d{4}', string='123-456-7890'))  # phone number
print(pattern1)
pattern2 = bool(re.match(pattern=r'\$\d*\.\d{2}', string='$123.45'))  # dollar amount
print(pattern2)
# NOTE: \w* also matches the empty string, so this is True for ANY input.
pattern3 = bool(re.match(pattern=r'\w*', string='Australia'))
print(pattern3)
#1. Importing the required packages
import matplotlib.pyplot as plt
import seaborn as sns
import warnings #To hide Warning messages
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn import preprocessing
import re
# 2. Reading Data
# Load the raw CSV export into a DataFrame and show the first rows
# as a quick sanity check.
df = pd.read_csv("dataset.csv")
print(df.head())
# 3. Data Preprocessing
# 3.1 Handling NULL or Duplicate Values
# How many missing values does each column carry?
print(df.isna().sum())
# Discard any row containing a missing value, then any exact duplicate row.
df = df.dropna()
df = df.drop_duplicates()
# Report the dimensions of the cleaned dataset.
print(df.shape)
# Summary statistics for the Rating column.
# Each entry pairs the label with its statistic; output is identical to
# printing them one by one.
rating_summary = [
    ("Rating mean:", df.Rating.mean()),
    ("Rating median:", df.Rating.median()),
    ("Rating mode:", df.Rating.mode()),
    ("Rating min:", df.Rating.min()),
    ("Rating max:", df.Rating.max()),
]
for label, value in rating_summary:
    print(label, value)
# 3.2 Handling Data Types of each Feature
# Reviews holds counts, so it must be an integer column.
print(df.dtypes)  # dtypes before the cast
df['Reviews'] = df['Reviews'].astype(int)
print(df.dtypes)  # dtypes after: Reviews is now int64
# Changing the Feature : Installs (Example : '10,000+' to 10000.0)
# Vectorized version of the original element-wise loop: strip the
# trailing '+' and the thousands separators, then cast to float.
# regex=False keeps '+' a literal character rather than a regex token.
df.Installs = (
    df.Installs.str.replace("+", "", regex=False)
               .str.replace(",", "", regex=False)
               .astype(float)
)
df.Installs.head()
# Changing the feature : Size (Change '15M' to 15.0)
newSize = []
for row in df.Size:
    stripped = row.replace("M", "")  # drop the megabyte suffix
    try:
        newSize.append(float(stripped))
    except ValueError:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit). Non-numeric entries such as
        # "Varies with device" (and any 'k'-suffixed sizes) become 0,
        # exactly as before.
        newSize.append(0)
df.Size = newSize  # write the cleaned values back
print(df.Size.head())
# Changing the feature : Price (remove '$' and convert to float)
# The original branched on `row != 0`, but the string "0" (the usual
# free-app value after CSV read) already converts cleanly once '$' is
# stripped. str() makes the same expression safe for a literal numeric 0
# as well, so one conversion covers both branches.
df.Price = [float(str(row).replace("$", "")) for row in df.Price]
print(df.Price.head())
# Changing the feature: Android Ver ("4.0.3 and up" -> 4.0)
newVer = []
for row in df['Android Ver']:
    # Keep the leading "major.minor" slice unless the value varies per device.
    candidate = row if row == "Varies with device" else row[0:3]
    try:
        candidate = float(candidate)
    except ValueError:
        # Narrowed from a bare `except:`; non-numeric entries
        # (e.g. "Varies with device") map to 0 as before.
        candidate = 0
    newVer.append(candidate)
df['Android Ver'] = newVer
# The original computed value_counts() and discarded the result (a no-op);
# print it, as the accompanying comment intended.
print(df['Android Ver'].value_counts())  # total count for each Android Ver
print(df['Android Ver'])
# 4. Analyzing Features
# 4.1 Categories — how many apps fall into each category?
print(df.Category.value_counts())
# Mean rating broken down by (Type, Category).
info = df.groupby(["Type", "Category"])["Rating"].mean()
print(info)
# 4.2 Price
# Show the most costly app(s) in the PlayStore: filter the rows whose
# price equals the column maximum (keeps all ties).
top_price = df["Price"].max()
print(df[df["Price"] == top_price])
# 5. Further Analysis
# All apps holding a perfect 5.0 rating.
df_full = df.loc[df['Rating'] == 5.0]
df_full.head()
# Distribution of review counts among the 5.0-rated apps.
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True) is the documented replacement.
sns.histplot(df_full.Reviews, kde=True)
plt.show()
# Skew for the 5.0-rated apps that have exactly 5 reviews.
# NOTE(review): axis=1 computes a per-row skew across numeric columns,
# not the skew of the Reviews column itself — confirm this is the intent
# (df_full["Reviews"].skew() would give the column skew).
skewValue = df_full[df_full["Reviews"] == 5].skew(axis=1)
print(skewValue)
# Category and Reviews:
# average review count per unique Category, sorted descending.
#
# BUG FIX: the original looped over every category, recomputed the same
# groupby each iteration, and then zipped `cat_list` (categories in order
# of appearance) against `review.values` (groupby output, alphabetically
# ordered) — misaligning category names with their mean review counts.
# A single groupby keeps names and values paired correctly.
review_by_cat = df.groupby('Category')['Reviews'].mean()
data_cat_rev = pd.DataFrame({
    "category": review_by_cat.index,
    "review": review_by_cat.values,
})
data_cat_rev = data_cat_rev.sort_values(["review"], ascending=False)
print(data_cat_rev)
# 6. Export the file to CSV
#df.to_csv("try.csv")