This is not really for student use. This page is storage for a significant piece of coding I have done.
Using the text of three books, about 1.2 MB as one long string, I can mine the data for the words it uses: 12,305 of them once numbers are excluded. I can analyze how normal words start (the first two characters) and end, and see which two-character sequences are normal inside words. Having done that, I can manufacture artificial, or fake, words that have some level of similarity to real words. What good is this? It is interesting to see how pronounceable the fake words are.
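Here is a minimal sketch of the idea before the full script; the sample sentence is made up for illustration, and the script below does the same thing with the three books.

sample = 'the cat sat on the warm stone near the garden wall'
words = [w for w in sample.split() if len(w) > 1]
starts = sorted(set(w[:2] for w in words))                               #pairs that begin words
ends = sorted(set(w[-2:] for w in words))                                #pairs that end words
mids = sorted(set(w[r:r+2] for w in words for r in range(1, len(w)-2)))  #pairs inside words
print(starts, ends, mids)
#a fake word then starts with a random start pair, grows by chaining mid pairs whose
# first char matches its current last char, and finishes with a matching end pair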
# three books v9 from long file common char pairs fake word v3.py JE Oct 15 2024
# ver3 generates several fake words
# the books are not embedded within the code; instead they are read from the drive. Having the long
# text embedded was slowing down code editing a lot.
# 3 books: m = Mysterious Island, f = Flying the Atlantic, a = Alice in Wonderland
# what are the most common character pairs to start words?
import time, random, string
print("m is The Mysterious Island by Jules Verne\n","a is Alice's Adventures in Wonderland by Lewis Carroll\n","f is Flying the Atlantic in Sixteen Hours, non-fiction")
print('seconds from epoch start',str(time.time())[:12],'whatever that is good for'); time.sleep(1)
print('Objective of Python code: What are the most common character pairs that start words?')
f10 = open('threeBksTxt.txt','r')
f12=f10.readlines() #readlines is important, it makes a list
f10.close();print('file was read from drive');time.sleep(.8)
print('f12 is a Python "list," the lines of text. There are',len(f12), 'lines of text')
alltext=' '.join(f12) #join all the list's items into one big string with space separator
print('length of alltext, one long string made of 3 books',len(alltext), 'characters');time.sleep(1.2)
print('part of alltext ',alltext[300000:302000],'\n')
alltext = alltext.lower()
#strip punctuation, accented characters, and roman-numeral fragments (note that replacing
# 'xi', 'xl', 'xv', 'xx' everywhere also mangles real words containing them, like taxi or existence)
for j in ['"','#','$',"'",'(',')','*','.',',',';','-','/','[',']','%','&','+','—','”','“','’','‘','?','!',':','œ','ë','æ','á','é', 'xi', 'xl', 'xv', 'xx','ö','î','ê']:
    alltext = alltext.replace(j,'')
print('length of alltext after trimming out punctuation',len(alltext))
allList = alltext.split() #get a list of all words including repeated words like and, or, but
allListSort = sorted(allList)
print('words in allListSort and the number of words',len(allListSort),allListSort)
sortedSet1 = sorted(set(allListSort)) #making it a set eliminates duplicates
print('the sorted set of words without duplications and the type of sortedSet1',type(sortedSet1),sortedSet1)
longerWords = [] #a list is going to be made from the set
for i in sortedSet1:
    if len(i)>1 and len(i)<12: #exclude 1-char words and very long 'words' that are really two words run together
        longerWords.append(i)
print('Number of different words at least 2 characters long:',len(longerWords),longerWords)
allListNoNum=[]
for k in longerWords:
    #eliminate any word that contains a digit
    if not any(ch in '0123456789' for ch in k):
        allListNoNum.append(k)
print('allListNoNum--------------',allListNoNum)
#find initial 2 char in these words
first2charsList=[]
for n in allListNoNum:
    first2charsList.append(n[:2]) #first two characters
first2charsList=sorted(set(first2charsList))
removeCount=0
removeThese = ['ds', 'dw', 'ei', 'ft', 'hj', 'hm', 'ii', 'ix', 'kb', 'lt', 'lv', 'lx', 'mt', 'nf', 'nj', 'pg', 'pm', 'ps', 'pt', 'rf', 'tm', 'yp']
for item in removeThese:
    if item in first2charsList:
        first2charsList.remove(item)
        removeCount += 1
print('Some unlikely first two chars removed ',removeCount);time.sleep(2)
print('how many diff double chars start words?',len(first2charsList),' 1st 2 chars in words',first2charsList)
#find last 2 char in these words
last2charsList=[]
for n in allListNoNum:
    last2charsList.append(n[-2:]) #last two characters
last2charsList=sorted(set(last2charsList))
print('how many diff double chars terminate words?',len(last2charsList),' last 2 chars in words',last2charsList)
#find 2-char sequences in the middles of words
mid2charsList=[]
#testwords=['abcdefghij','mnortvxyz']
for p in allListNoNum:
    for r in range (1,len(p)-2): #interior positions only; skips the first and last pair
        mid2charsList.append(p[r]+p[r+1])
mid2charsList=sorted(set(mid2charsList))
print('+++mid2charsList++++++++++++++','how many diff 2-char sequences are in middles of words?',len(mid2charsList),'mid 2 chars list',mid2charsList)
# this much seems to be working. Now, can code go on to compose nonsense words that have
# realistic 2-char sequences?
vowelsList = ['a','e','i','o','u','y']
severalFakeWords = []
for fg in range (0,10): #make 10 fake words
    eLen=random.randint(4,13)
    eStr=first2charsList[random.randint(0,len(first2charsList)-1)] #-1, not -2: randint includes both endpoints
    for et in range(1,eLen-2): #grow the word: find a mid pair whose 1st char is eStr's last char, keep its 2nd char
        eu=eStr[-1] #the last char in eStr
        eCandidates=[ev for ev in mid2charsList if ev[0]==eu]
        #an earlier version re-rolled random pairs until one matched, which hangs when no
        # pair starts with eu (for example the 'ù' of 'où'), so fall back to a vowel instead
        if eCandidates:
            eStr=eStr+random.choice(eCandidates)[1]
        else:
            eStr=eStr+random.choice(vowelsList)
    #finish the word: find a last char from last2charsList the same way
    eu=eStr[-1]
    eCandidates=[ev for ev in last2charsList if ev[0]==eu]
    if eCandidates:
        eStr=eStr+random.choice(eCandidates)[1]
    else:
        eStr=eStr+random.choice(vowelsList)
    eStrOrig=eStr
    #print('an artificial word is complete but needs a vowel scan:',eStr)
    #vowel scan: look at adjacent consonant pairs and randomly (50% yes, 50% skip) insert a
    # vowel between them so that runs of 3 or more consonants become rare. Some 3-consonant
    # substrings still slip through, but the words read much better than without the scan.
    for fc in range (2,14):
        fd=min(fc,len(eStr)-2) #clamp so eStr[fd+1] stays in range
        if (eStr[fd] not in vowelsList) and (eStr[fd+1] not in vowelsList) and (random.randint(0,11)>5):
            #stick a vowel between the 2 consonants
            eStr=eStr[:fd+1]+random.choice(vowelsList)+eStr[(fd+1):]
    #print('>>>>>>>>>',eStr,'\n original',eStrOrig)
    #not perfect, but better than without the vowel scan
    severalFakeWords.append(eStr)
print('several Fake Words from Python three books v9 from long file common char pairs fake word v3.py')
for fq in severalFakeWords:
    print(fq)
#examples
# sngkdge
# oskefyycequyd
# umhihmer
# neeirlryjobthi
# nihbajesunyd
# elwukalohasihahi
# edcokukauwoù
# phfefycehhugyfs
#
# ante
# izbyncuas
# upyjewawebme
# cllymdonquvy
# iooad
# zine
# aexeefusadd
# sumidubyfof
#
# myjoh
# zetcafeacys
# xidededizi
# yihlagom
# etom
# bybvezyllufyway
# kneix
# gnuzorjuses
#
# pluftado
# luaarhacotydiguh
# ottelehesyhyyn
# illywoynyjau
# waogasedlir
# eftdokugozut
# snvaait
# gobeyosal
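In the script, eCandidates is rebuilt with a full list scan for every character added, which is simple but slow. Here is a sketch of a faster version that groups the pairs once up front; pairsByFirst is a name introduced here, and mid2charsList is assumed from the script above.

#build first-char buckets once, then each lookup is a single dict hit
pairsByFirst = {}
for pr in mid2charsList:
    pairsByFirst.setdefault(pr[0], []).append(pr)
#inside the word-growing loop this one line replaces the list scan:
# eCandidates = pairsByFirst.get(eStr[-1], [])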
Most of those can be pronounced. It helps to be flexible as you form syllables. Pluftado is a favorite.
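One refinement that would probably help pronounceability further: the set() calls make every pair equally likely, so a rare opening like 'aq' is drawn as often as 'th'. Here is a sketch of weighting the start pairs by how often they actually occur, reusing allList from the script; startCounts is a name introduced here.

from collections import Counter
import random
startCounts = Counter(w[:2] for w in allList if len(w) > 1)  #allList keeps repeats, so these are frequencies
pairs = list(startCounts.keys())
weights = list(startCounts.values())
print(random.choices(pairs, weights=weights, k=5))  #common openings like 'th' now dominate the draws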
There are 26*26 = 676 possible character pairs, aa through zz, and many of them just aren't used in real words. More like 228 are realistic at the start of words, 275 at the end, and 498 turn up in the interior. A quick coverage check follows the lists below.
228 char pairs are allowed to start words. This is after deleting 22 pairs that somehow sneaked in. ['ab', 'ac', 'ad', 'ae', 'af', 'ag', 'ah', 'ai', 'al', 'am', 'an', 'ao', 'ap', 'aq', 'ar', 'as', 'at', 'au', 'av', 'aw', 'ax', 'ay', 'az', 'ba', 'be', 'bi', 'bl', 'bo', 'br', 'bu', 'by', 'ca', 'ce', 'ch', 'ci', 'cl', 'co', 'cr', 'cu', 'cy', 'da', 'de', 'di', 'do', 'dr', 'dt', 'du', 'dy', 'ea', 'eb', 'ec', 'ed', 'ee', 'ef', 'eg', 'ej', 'el', 'em', 'en', 'ep', 'eq', 'er', 'es', 'et', 'eu', 'ev', 'ex', 'ey', 'fa', 'fe', 'fi', 'fl', 'fo', 'fr', 'fu', 'ga', 'ge', 'gi', 'gl', 'gn', 'go', 'gr', 'gu', 'gy', 'ha', 'he', 'hi', 'ho', 'hu', 'hy', 'ic', 'id', 'if', 'ig', 'il', 'im', 'in', 'io', 'ir', 'is', 'it', 'iv', 'iz', 'ja', 'je', 'ji', 'jo', 'ju', 'ka', 'ke', 'ki', 'kl', 'kn', 'ko', 'kr', 'la', 'le', 'li', 'll', 'lo', 'lu', 'ly', 'ma', 'me', 'mi', 'mo', 'mr', 'mu', 'my', 'na', 'ne', 'ni', 'no', 'nu', 'oa', 'ob', 'oc', 'od', 'of', 'og', 'oh', 'oi', 'ok', 'ol', 'om', 'on', 'oo', 'op', 'or', 'os', 'ot', 'ou', 'ov', 'ow', 'ox', 'oy', 'où', 'pa', 'pe', 'ph', 'pi', 'pl', 'po', 'pr', 'pu', 'py', 'qu', 'ra', 're', 'rh', 'ri', 'ro', 'ru', 'sa', 'sc', 'se', 'sh', 'si', 'sk', 'sl', 'sm', 'sn', 'so', 'sp', 'sq', 'st', 'su', 'sw', 'sy', 'ta', 'te', 'th', 'ti', 'to', 'tr', 'tu', 'tw', 'ty', 'ud', 'ug', 'ul', 'um', 'un', 'up', 'ur', 'us', 'ut', 'va', 've', 'vi', 'vo', 'vu', 'wa', 'we', 'wh', 'wi', 'wo', 'wr', 'xe', 'xi', 'ya', 'ye', 'yi', 'yo', 'za', 'ze', 'zi', 'zo']
275 char pairs terminate words. I have not edited this list for sneaky errors creeping in. ['aa', 'ab', 'ac', 'ad', 'ae', 'af', 'ag', 'ah', 'ai', 'ak', 'al', 'am', 'an', 'ap', 'ar', 'as', 'at', 'au', 'aw', 'ax', 'ay', 'ba', 'bb', 'be', 'bi', 'bo', 'br', 'bs', 'bt', 'by', 'ca', 'cc', 'ce', 'ch', 'ck', 'co', 'cs', 'ct', 'cu', 'cy', 'da', 'dd', 'de', 'di', 'do', 'ds', 'dy', 'ea', 'eb', 'ec', 'ed', 'ee', 'ef', 'eg', 'ei', 'ek', 'el', 'em', 'en', 'eo', 'ep', 'er', 'es', 'et', 'ew', 'ex', 'ey', 'ez', 'fa', 'fc', 'fe', 'ff', 'fs', 'ft', 'fy', 'ga', 'ge', 'gg', 'gh', 'gi', 'gn', 'go', 'gs', 'gy', 'ha', 'he', 'hi', 'hl', 'hm', 'hn', 'ho', 'hs', 'ht', 'hy', 'ia', 'ib', 'ic', 'id', 'ie', 'if', 'ig', 'ii', 'il', 'im', 'in', 'io', 'ip', 'ir', 'is', 'it', 'iv', 'ix', 'jo', 'ka', 'ke', 'ks', 'ky', 'la', 'lb', 'ld', 'le', 'lf', 'li', 'lk', 'll', 'lm', 'ln', 'lo', 'lp', 'ls', 'lt', 'lv', 'lx', 'ly', 'ma', 'mb', 'me', 'mi', 'mn', 'mo', 'mp', 'mr', 'ms', 'mt', 'my', 'na', 'nc', 'nd', 'ne', 'nf', 'ng', 'ni', 'nj', 'nk', 'nn', 'no', 'ns', 'nt', 'nu', 'ny', 'ob', 'od', 'oe', 'of', 'og', 'oh', 'ok', 'ol', 'om', 'on', 'oo', 'op', 'or', 'os', 'ot', 'ou', 'ow', 'ox', 'oy', 'où', 'pa', 'pe', 'pg', 'ph', 'pl', 'po', 'ps', 'pt', 'py', 'ra', 'rb', 'rc', 'rd', 're', 'rf', 'rg', 'rh', 'ri', 'rk', 'rl', 'rm', 'rn', 'ro', 'rp', 'rs', 'rt', 'rv', 'ry', 'sa', 'sc', 'se', 'sh', 'si', 'sk', 'sl', 'sm', 'sn', 'so', 'sp', 'sq', 'ss', 'st', 'sy', 'ta', 'tc', 'td', 'te', 'th', 'ti', 'tm', 'to', 'ts', 'tt', 'ty', 'tz', 'ub', 'ud', 'ue', 'ug', 'uk', 'ul', 'um', 'un', 'up', 'ur', 'us', 'ut', 'ux', 'va', 've', 'vi', 'vy', 'wa', 'wd', 'we', 'wl', 'wn', 'wo', 'ws', 'wy', 'xe', 'xi', 'xt', 'xy', 'ya', 'yd', 'ye', 'yi', 'yn', 'yo', 'ys', 'ze', 'zi', 'zo', 'zy']
498 char pairs are found to be in the interior of words. Many look doubtful but may be valid between syllables. ['aa', 'ab', 'ac', 'ad', 'ae', 'af', 'ag', 'ah', 'ai', 'aj', 'ak', 'al', 'am', 'an', 'ao', 'ap', 'aq', 'ar', 'as', 'at', 'au', 'av', 'aw', 'ax', 'ay', 'az', 'ba', 'bb', 'bd', 'be', 'bf', 'bi', 'bj', 'bl', 'bm', 'bn', 'bo', 'br', 'bs', 'bt', 'bu', 'bv', 'by', 'ca', 'cc', 'ce', 'ch', 'ci', 'ck', 'cl', 'cm', 'cn', 'co', 'cq', 'cr', 'ct', 'cu', 'cy', 'da', 'db', 'dc', 'dd', 'de', 'df', 'dg', 'dh', 'di', 'dj', 'dk', 'dl', 'dm', 'dn', 'do', 'dp', 'dr', 'ds', 'dt', 'du', 'dv', 'dw', 'dy', 'dz', 'ea', 'eb', 'ec', 'ed', 'ee', 'ef', 'eg', 'eh', 'ei', 'ej', 'ek', 'el', 'em', 'en', 'eo', 'ep', 'eq', 'er', 'es', 'et', 'eu', 'ev', 'ew', 'ex', 'ey', 'ez', 'fa', 'fb', 'fc', 'fd', 'fe', 'ff', 'fh', 'fi', 'fj', 'fl', 'fm', 'fn', 'fo', 'fp', 'fr', 'fs', 'ft', 'fu', 'fw', 'fy', 'ga', 'gb', 'gc', 'gd', 'ge', 'gf', 'gg', 'gh', 'gi', 'gk', 'gl', 'gm', 'gn', 'go', 'gp', 'gr', 'gs', 'gt', 'gu', 'gw', 'gy', 'gz', 'ha', 'hb', 'hc', 'hd', 'he', 'hf', 'hg', 'hh', 'hi', 'hj', 'hl', 'hm', 'hn', 'ho', 'hp', 'hq', 'hr', 'hs', 'ht', 'hu', 'hw', 'hy', 'ia', 'ib', 'ic', 'id', 'ie', 'if', 'ig', 'ih', 'ii', 'ik', 'il', 'im', 'in', 'io', 'ip', 'iq', 'ir', 'is', 'it', 'iu', 'iv', 'iw', 'ix', 'iz', 'ja', 'jc', 'je', 'ji', 'jo', 'ju', 'ka', 'kd', 'ke', 'kf', 'kg', 'ki', 'kj', 'kk', 'kl', 'km', 'kn', 'ko', 'kp', 'kr', 'ks', 'kt', 'ku', 'kw', 'ky', 'la', 'lb', 'lc', 'ld', 'le', 'lf', 'lg', 'lh', 'li', 'lk', 'll', 'lm', 'ln', 'lo', 'lp', 'lr', 'ls', 'lt', 'lu', 'lv', 'lw', 'ly', 'ma', 'mb', 'mc', 'md', 'me', 'mf', 'mh', 'mi', 'mj', 'ml', 'mm', 'mn', 'mo', 'mp', 'mr', 'ms', 'mt', 'mu', 'mw', 'my', 'na', 'nb', 'nc', 'nd', 'ne', 'nf', 'ng', 'nh', 'ni', 'nj', 'nk', 'nl', 'nm', 'nn', 'no', 'np', 'nq', 'nr', 'ns', 'nt', 'nu', 'nv', 'nw', 'ny', 'nz', 'oa', 'ob', 'oc', 'od', 'oe', 'of', 'og', 'oh', 'oi', 'oj', 'ok', 'ol', 'om', 'on', 'oo', 'op', 'oq', 'or', 'os', 'ot', 'ou', 'ov', 'ow', 'ox', 'oy', 'oz', 'pa', 'pb', 'pc', 'pd', 'pe', 'pg', 'ph', 'pi', 'pk', 'pl', 'pm', 'pn', 'po', 'pp', 'pr', 'ps', 'pt', 'pu', 'pw', 'py', 'qu', 'ra', 'rb', 'rc', 'rd', 're', 'rf', 'rg', 'rh', 'ri', 'rj', 'rk', 'rl', 'rm', 'rn', 'ro', 'rp', 'rq', 'rr', 'rs', 'rt', 'ru', 'rv', 'rw', 'ry', 'sa', 'sb', 'sc', 'sd', 'se', 'sf', 'sg', 'sh', 'si', 'sj', 'sk', 'sl', 'sm', 'sn', 'so', 'sp', 'sq', 'sr', 'ss', 'st', 'su', 'sv', 'sw', 'sy', 'ta', 'tb', 'tc', 'td', 'te', 'tf', 'tg', 'th', 'ti', 'tj', 'tl', 'tm', 'tn', 'to', 'tp', 'tr', 'ts', 'tt', 'tu', 'tw', 'ty', 'tz', 'ua', 'ub', 'uc', 'ud', 'ue', 'uf', 'ug', 'uh', 'ui', 'uk', 'ul', 'um', 'un', 'uo', 'up', 'ur', 'us', 'ut', 'uv', 'uw', 'ux', 'uy', 'uz', 'va', 've', 'vi', 'vo', 'vr', 'vu', 'vy', 'wa', 'wb', 'wc', 'wd', 'we', 'wf', 'wh', 'wi', 'wj', 'wk', 'wl', 'wm', 'wn', 'wo', 'wp', 'wr', 'ws', 'wt', 'wu', 'ww', 'xa', 'xc', 'xe', 'xh', 'xo', 'xp', 'xs', 'xt', 'xu', 'xw', 'xy', 'ya', 'yb', 'yc', 'yd', 'ye', 'yf', 'yg', 'yh', 'yi', 'yj', 'yl', 'ym', 'yn', 'yo', 'yp', 'yr', 'ys', 'yt', 'yu', 'yv', 'yw', 'yy', 'yz', 'za', 'zb', 'ze', 'zi', 'zl', 'zo', 'zu', 'zv', 'zz']
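A quick check of how much of the 676-pair space each of those lists covers (the counts are the three totals above):

total = 26 * 26  #676 possible pairs
for label, count in [('start', 228), ('end', 275), ('interior', 498)]:
    print(label, count, 'pairs =', round(100 * count / total), '% of all possible pairs')
#prints roughly: start 34%, end 41%, interior 74%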