Bash to python
Pattern
Machine learning
Forced aligner
Praat
Python
import readlinefor i in range(readline.get_current_history_length()): print readline.get_history_item(i)Alias
from nltk.corpus.reader import framenet as fnModule version
>>> nltk.__version__'3.0.0b1'Dict entities have keywords and entries
>>> myDict = st.get_entities(sentence){u'ORGANIZATION': [u'University of California'], u'LOCATION': [u'California', u'United States'],u'O': [u'is located in', u',']}To unfold, use .items():
>>> for tup in myDict.items():... print tup... (u'ORGANIZATION', [u'University of California'])(u'LOCATION', [u'California', u'United States'])(u'O', [u'is located in', u','])Expand a list
Each element in myDict.items() is a tuple; each tuple in turn is a unicode string and a list:
>>> type(tup[0])<type 'unicode'>>>> type(tup[1])<type 'list'>To expand that list:
print " ".join(tup[1])Convert a string s to a tuple t
t = (s,)Convert tuples to list -- one tuple at a time
l = list(t[0])l1 = l[0:2] # where to start : where to stop (exclusive)Convert tuples to strings -- all at once
s = str(t)Split a string into a list
field = filename.split("_")Assign directly and count words
text = line.split("|")[3]WC = WC + len(text.split())Assign by subtraction
fdate = field.pop(0)fhour = field.pop(0)country = field.pop(0)network = field.pop(0)show = fieldStrip a newline from a string
sentence_sub = fields.pop(0).rstrip()Split string into tuples (chunks) of 3
fields = ["can't", '-0.1', '0.1', 'modern', '0.2', '0.3']zip(*[fields[i::3] for i in range(3)])[("can't", '-0.1', '0.1'), ('modern', '0.2', '0.3')]Concatenate strings
text += str(nltk.tag.tuple2str(tagged_text[x]) + " ")Join a list with an underscore or space as delimiter
show = "_".join(field)phrase = " ".join(tup[0]) # If tup[0] only contains one word, no space is addedprint "".join([stem,",SMT_01",",",str(tup[0]),",",str(tup[1]),",",str(tup[2]).rstrip()]) # strip newlineJoin a list, keeping the UTF-8 encoding, and replacing spaces with pipe symbols
snt = parse(text, lemmata=True, relations=True)text = re.sub('\ ', '|', snt)if snt != "": print u"".join([field[0],"|",field[1],"|POS_03|",text]).encode('utf-8').strip()Check for substring in string
if "SMT_" not in line: continueData type
>>> type(fff)<class 'nltk.corpus.reader.framenet.PrettyList'>>>> type(fff[0])<class 'nltk.corpus.reader.framenet.AttrDict'>Modules
Time now
datetime.datetime.now().strftime("%Y-%m-%d %H:%M")Remove parens from a string s
import re re.sub('[()]', '', s)Remove brackets and single quotes from tuple t -- all elements
b = str(t).replace('[','').replace(']','').replace("'",'')Clean up unicode that halts stanford-ner, MBSP, et al
text = re.sub('^[>,\ ]{0,6}', '', field[3]) text = str(text).replace('\x00 ','').replace('\xef\xbf\xbd', '').replace('\xb6','').replace('\xa9','') text = str(text).replace('\xc3\xaf', '').replace('\x5c','').replace('\xf1','').replace('\xe2\x99\xaa','')Get lines from file in utf-8
import codecswith codecs.open(filename,encoding='utf8') as fp: for line in fp: print line.encode('utf8')Or without specifying character encoding
with open(filename) as fp: for line in fp: # Split each line into fields field = line.split("|") # Pretty debug print('\n'.join('{}: {}'.format(*k) for k in enumerate(field)))For loop -- get separate tuples from the parse function and make a replacement in the first element before printing
tups = sentiment(line).assessments for tup in tups: a = str(tup[0]).replace('[','').replace(']','').replace("'",'') smt = [a," (",str(tup[1]),", ",str(tup[2]),")"] print "".join(smt), print "\r"Re-written to fluent python (see SentiWordNet.py):
for tup in sentiment(sentence).assessments: words = " ".join(tup[0]) terms = "".join([terms,",",words,",",str(tup[1]),",",str(tup[2])]) print termsCatch failure
try: sentence = field[3] except IndexError: print line continueEndless
while True:Test string and string length
if field[2] == "SEG": print line, continue elif len(field[2]) != 3: print line, continueCompound conditions with parens
if ( network == 'CampaignAds' ) or ( network == 'Shooters' ) or ( network == 'DigitalEphemera' ): continueSubstring test
if "POS_01" in line:Test empty string
if not myString: if myString == "":Skip if the line is empty
if text.strip() == '' : continue import reTypes
re.match re.search re.subSkip header files
if not re.match("([0-9]{14})", line): continueTeletext page
if re.search("(\|[0-9]{3}\|)", line):
>>> a=str(fn.frame(114)) >>> b = re.search("Core\:\ [A-Za-z_]*\ \([0-9]{1,6}\)", a) >>> b.group() 'Core: Hypothetical_event (563)' >>> if b: print b.group()You can turn it into a one-liner for a single hit:
COR = re.search("Core\:\ \w*\ \(\d*\)", str(fn.frame(ID))).group()Ingest old-style SMT_ lines from .seg files
line=' very, serious (-0.33, 0.67) failure (-0.316666666667, 0.3)'pattern = '\ ?[a-zA-Z]{1,99},?\ ?[a-zA-Z]{1,99}?\ \(-?\d\.[0-9]{1,12},\ -?\d\.[0-9]{1,12}\)'for match in re.finditer(pattern, line): s = match.start() e = match.end() print (line[s:e])very, serious (-0.33, 0.67)failure (-0.316666666667, 0.3)Or more compact:
for match in re.finditer(pattern, line): print (line[match.start():match.end()])Faster than grep in file fp -- output all lines
term = "ALASKA" print ''.join((line for line in fp if term in line))Fast egrep in file fp
pat = re.compile("^([A-Z][0-9]+)*$") print sum(1 for line in fp if pat.search(line)) print ''.join((line for line in fp if pat.search(line)))Either use the 2.7 method (a for append or w for overwrite):
file=open('filename.txt','w') file.write('some text')Or the backported 3.x method (a for append or w for overwrite):
from __future__ import print_function with open('filename.txt', 'a') as f: print("hi there", file=f)Print without newline
print "hi",-- but this adds a trailing space. To avoid it:
import sys for i in range(10): sys.stdout.write("*") sys.stdout.write("\n")Test installation from commandline
python -m 'rpy2.robjects.tests.__init__'