# string similarity: scratch comparisons of emails, names and phone numbers
import textdistance as td
import re
'''############## email ##################'''
# Two sample addresses whose local parts differ only by trailing digits.
email_a = 'johnsmith@gmail.com'
email_b = 'johnsmith09@hotmail.com'

# Split each address at '@' into its local part and its provider.
name_a, provider_a = email_a.split('@')
name_b, provider_b = email_b.split('@')

# Keep only ASCII letters in the local parts before comparing them.
_non_letters = re.compile(r'[^a-zA-Z]')
name_a = _non_letters.sub('', name_a)
name_b = _non_letters.sub('', name_b)

# Report one normalized similarity score per textdistance algorithm.
for _template, _algorithm in (
    ('levenshtein similarity {0}', td.damerau_levenshtein),
    ('longest common substring similarity {0}', td.lcsstr),
    ('prefix similarity {0}', td.prefix),
    ('suffix similarity {0}', td.postfix),
):
    print(_template.format(_algorithm.normalized_similarity(name_a, name_b)))
'''############## firstname lastname ##################'''
# Sample first/last names with spelling variations, scored pair by pair
# with textdistance's match rating approach (td.mra).
first_name_a, last_name_a = 'mary', 'smith'
first_name_b, last_name_b = 'merry', 'smyth'

for _left, _right in (
    (first_name_a, first_name_b),
    (last_name_a, last_name_b),
):
    _score = td.mra.normalized_similarity(_left, _right)
    print('Matching rating approach {0}'.format(_score))
'''soundex'''
'''jellyfish and fuzzy look better, but both need a C/C++ compiler'''
import metaphone as mp
def mp_similarity(str1, str2):
    """Score the phonetic agreement of two words with Double Metaphone.

    Each word is encoded with ``mp.doublemetaphone``, which yields a
    ``(primary, secondary)`` pair of codes. The score is the fraction of
    those two code slots that agree between the words.

    Args:
        str1: First word to compare.
        str2: Second word to compare.

    Returns:
        0.0, 0.5 or 1.0 -- the share of code slots that match.
    """
    primary1, secondary1 = mp.doublemetaphone(str1)
    primary2, secondary2 = mp.doublemetaphone(str2)
    # An empty secondary code means the word has no alternate encoding.
    # Fall back to the primary code so two unrelated words are not credited
    # with a match merely because both secondaries are ''.
    secondary1 = secondary1 or primary1
    secondary2 = secondary2 or primary2
    return ((primary1 == primary2) + (secondary1 == secondary2)) / 2.0
# Show the raw Double Metaphone codes for each sample name.
for _name in (first_name_a, first_name_b, last_name_a, last_name_b):
    print(mp.doublemetaphone(_name))

# Then score the first-name pair and the last-name pair.
for _left, _right in (
    (first_name_a, first_name_b),
    (last_name_a, last_name_b),
):
    print('double metaphone similarity {0}'.format(mp_similarity(_left, _right)))
'''############## phone number ##################'''
# Sample inputs for a phone-number comparison; presumably the same number
# written in international and local notation -- confirm before relying on it.
phone_a = '+61 433 500 123'
phone_b = '0433-500-123'
# TODO: phone number similarity is not implemented yet.