import re # Import the regular expression module
prog = re.compile(r'\d{3}-\d{3}-\d{4}') # Compile the pattern as a raw string (phone number of the format xxx-xxx-xxxx)
result = prog.match('123-456-7890') # See if the pattern matches
print(bool(result))
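# If the string does not fit the pattern, match returns None, so bool() gives False.
print(bool(prog.match('123-45-6789')))          # False: the middle group has only two digits
print(bool(prog.match('phone: 123-456-7890')))  # False: match only anchors at the start of the string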
import re # Import the regular expression module
matches = re.findall(r'\d+', 'the recipe calls for 10 strawberries and 1 banana') # Find the numeric values in the string
print(matches)
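# findall returns the matches as strings; convert them if numbers are needed.
quantities = [int(m) for m in matches]
print(quantities)  # [10, 1]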
pattern1 = bool(re.match(pattern=r'\d{3}-\d{3}-\d{4}', string='123-456-7890')) # The first pattern
print(pattern1)
pattern2 = bool(re.match(pattern=r'\$\d*\.\d{2}', string='$123.45')) # The second pattern
print(pattern2)
pattern3 = bool(re.match(pattern=r'\w*', string='Australia')) # The third pattern
print(pattern3)
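# Note that re.match only requires the pattern to match at the start of the string,
# so trailing extra text still counts as a match. re.fullmatch can be used instead
# when the whole string has to fit the pattern.
print(bool(re.match(r'\d{3}-\d{3}-\d{4}', '123-456-7890 ext. 5')))      # True
print(bool(re.fullmatch(r'\d{3}-\d{3}-\d{4}', '123-456-7890 ext. 5')))  # False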
# 1. Importing the required packages
import matplotlib.pyplot as plt
import seaborn as sns
import warnings  # To hide warning messages
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn import preprocessing
import re

# 2. Reading the data
df = pd.read_csv("dataset.csv")
print(df.head())

# 3. Data preprocessing
# 3.1 Handling null or duplicate values
# Count the null values in each column of df
print(df.isna().sum())
# Remove rows with null values
df = df.dropna()
# print(df.head())
# Remove the duplicated rows in the dataset
df = df.drop_duplicates()
# print(df)
# Display the size of the dataset after cleaning
print(df.shape)
# Analyze the ratings
print("Rating mean:", df.Rating.mean())
print("Rating median:", df.Rating.median())
print("Rating mode:", df.Rating.mode())
print("Rating min:", df.Rating.min())
print("Rating max:", df.Rating.max())

# 3.2 Handling the data type of each feature
# The feature Reviews must be of int type.
print(df.dtypes)  # Display the data type of each feature
# Convert the Reviews column to int64
df['Reviews'] = df['Reviews'].astype(int)
print(df.dtypes)

# Changing the feature Installs (example: '10,000+' to 10000.0)
newInstalls = []                       # cleaned values for the Installs column
for row in df.Installs:                # loop through df.Installs to do the change
    row = row.replace("+", "")         # remove the trailing '+'
    newRow = row.replace(",", "")      # remove the thousands separators
    newInstalls.append(float(newRow))  # convert the cleaned string to float
df.Installs = newInstalls              # write the cleaned values back into df
print(df.Installs.head())

# Changing the feature Size (example: '15M' to 15.0)
newSize = []
for row in df.Size:
    newrow = row.replace("M", "")      # remove the trailing 'M'
    try:
        newSize.append(float(newrow))  # convert the cleaned string to float
    except ValueError:
        newSize.append(0)              # when the value is 'Varies with device'
df.Size = newSize                      # write the cleaned values back into df
print(df.Size.head())

# Changing the feature Price (remove '$' and convert to float)
newPrice = []
for row in df.Price:
    if row != 0:                              # the value is a string such as '$2.44' or '0'
        newrow = float(row.replace("$", ""))  # drop the '$' and convert to float
    else:
        newrow = 0
    newPrice.append(newrow)
df.Price = newPrice
print(df.Price.head())

# Changing the feature Android Ver (example: '4.0.3 and up' to 4.0)
newVer = []
for row in df['Android Ver']:
    newrow = row[0:3] if (row != "Varies with device") else row
    try:
        newrow = float(newrow)
    except ValueError:
        newrow = 0  # when the value is 'Varies with device'
    newVer.append(newrow)
df['Android Ver'] = newVer
print(df['Android Ver'].value_counts())  # total count for each Android Ver
print(df['Android Ver'])
# print(df)        # check the result in df
# print(df.dtypes)

# 4. Analyzing features
# 4.1 Category: find the total count for each category
print(df.Category.value_counts())
# Group by the columns 'Type' and 'Category' and aggregate Rating with the mean
info = df.groupby(["Type", "Category"])["Rating"].mean()
print(info)

# 4.2 Price: find the most costly app in the Play Store
print(df[df["Price"] == df["Price"].max()])

# 5. Further analysis
# Find all apps with a 5.0 rating
df_full = df.loc[df['Rating'] == 5.0]
# df_full = df.loc[df['Rating'] == 5.0, ['App']]
print(df_full.head())
# Plot the distribution of Reviews for the 5.0-rated apps
sns.histplot(df_full.Reviews, kde=True)  # distplot is deprecated in recent seaborn
plt.show()
# Skewness of the Reviews column for the 5.0-rated apps
skewValue = df_full["Reviews"].skew()
print(skewValue)

# Category and Reviews:
# list the unique categories, display the average number of reviews for each,
# then sort and display the result in descending order of reviews
cat_list = list(df['Category'].unique())
review = df.groupby('Category')['Reviews'].mean()  # average Reviews per Category
cat_review = review.reindex(cat_list).values       # align the averages with cat_list order
data_cat_rev = pd.DataFrame({"category": cat_list, "review": cat_review})
data_cat_rev = data_cat_rev.sort_values(["review"], ascending=False)
print(data_cat_rev)

# 6. Export the cleaned data to CSV
# df.to_csv("try.csv")
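# An alternative sketch (not part of the pipeline above): the same Installs/Size/Price
# cleaning can be done with vectorized pandas string methods instead of Python loops.
# It is shown on a small hand-made frame so it does not touch df, whose columns have
# already been converted at this point.
demo = pd.DataFrame({"Installs": ["10,000+", "500+"],
                     "Size": ["15M", "Varies with device"],
                     "Price": ["$2.44", "0"]})
demo["Installs"] = pd.to_numeric(demo["Installs"].str.replace("+", "", regex=False)
                                                 .str.replace(",", "", regex=False))
demo["Size"] = pd.to_numeric(demo["Size"].str.replace("M", "", regex=False),
                             errors="coerce").fillna(0)  # non-numeric sizes become 0
demo["Price"] = pd.to_numeric(demo["Price"].str.replace("$", "", regex=False))
print(demo)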