Some RegEx Patterns

import re

# Replace every run of characters other than ASCII letters, digits,
# apostrophes and spaces with a single space.
transcript = re.sub(r'[^a-zA-Z0-9\' ]+', ' ', transcript)

# Squeeze runs of two or more whitespace characters into one space, after
# trimming leading/trailing whitespace. The pattern is a raw string so "\s"
# reaches the regex engine instead of being an invalid string escape.
transcript = re.sub(r"\s\s+", " ", transcript.strip())

# Extract digit runs (optionally split by whitespace, at least two digits in
# total) that are not followed by a lowercase letter or another digit.
finds = re.findall(r'\d+\s*\d+(?![a-z\d])', transcript)

# Extract decimal numbers with exactly one digit after the point that come
# directly after "|"; any leading "-" characters are captured with the number.
# Raw string so "\|", "\d" and "\." are regex escapes, not string escapes.
finds = re.findall(r"\|(-*\d+\.\d)", transcript)

# Locate digit runs that are immediately followed by a lowercase letter and
# read off where each run starts and ends inside the transcript.
find_idx = re.finditer(r'\d+(?=[a-z])', transcript)
for item in find_idx:  # iterate the matches just found (was the undefined name `temp`)
    # span() of the whole match gives the same values as start(0) / end(0).
    start, end = item.span()

# remove punctuation
import string

# Variant that also strips the apostrophe:
# punc = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''

# Characters to delete. Unlike the variant above, the apostrophe is not listed,
# so it survives. The backslash is written as "\\" because "\," is not a valid
# string escape; the resulting set of characters is unchanged.
punc = '''!()-[]{};:"\\,<>./?@#$%^&*_~'''

# str.translate with a delete-table removes all listed characters in one pass.
new_string = old_string.translate(str.maketrans('', '', punc))

# Variant that strips every ASCII punctuation character:
# new_string = old_string.translate(str.maketrans('', '', string.punctuation))

Read and Write

# find all audio files (.wav, .mp3, .m4a, .aac) recursively under a folder

import glob, os

import pandas as pd

import subprocess

import csv

import json

# for f in glob.glob(audio_folder+'/**/*.mp3', recursive=True): # python 3 only

# Walk the whole tree under audio_folder and build the full path of every file
# whose name ends with one of the audio extensions. endswith() requires the
# dot, so an extensionless file literally named "wav" is no longer picked up.
for root, _dirs, names in os.walk(audio_folder):
    for name in names:
        if name.endswith((".wav", ".mp3", ".m4a", ".aac")):
            audio_path = os.path.join(root, name)

# normalise: run ffmpeg with one channel (-ac 1), 16000 Hz sample rate
# (-ar 16000) and the pcm_s16le codec, skipped when the output file exists.
if not os.path.isfile(output_name):
    ffmpeg_cmd = [
        'ffmpeg', '-i', original_audio_path,
        '-ac', '1', '-ar', '16000', '-acodec', 'pcm_s16le',
        output_name,
    ]
    # Argument list (no shell); ffmpeg's exit status is not checked here.
    subprocess.call(ffmpeg_cmd)

# load the excel file
# NOTE(review): skip_blank_lines is documented for read_csv; recent pandas
# versions of read_excel may reject it with a TypeError - confirm against the
# installed pandas before relying on it.
df = pd.read_excel(
    excel_name,
    engine='openpyxl',
    skip_blank_lines=False,
)

anno_dict = {}

# Visit the sheet row by row; iterrows() yields (index label, row Series).
for row_ind, row_cont in df.iterrows():
    pass

# write to csv file
line = "wav_filename,wav_filesize,transcript,phoneme,AccComFluProTotal\n"

# Mode "w" already empties the file, so no explicit truncate() is needed, and
# the with-block guarantees the handle is flushed and closed.
with open(saveName, "w", encoding="utf-8") as fout:
    fout.write(line)
    # write the data rows here, while the file is still open

# load the csv file
# newline="" lets the csv module do its own newline handling, so quoted fields
# containing line breaks are parsed correctly.
with open(annotation_file, encoding="utf8", newline="") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    # Skip the first row (presumably the header); the None default avoids a
    # StopIteration when the file is empty.
    next(reader, None)
    for row in reader:
        pass

import pickle

# Write to pickle file: both objects are dumped back to back into one file.
with open("features_word_scoring.pkl", 'wb') as f:
    pickle.dump(word_features, f)
    pickle.dump(word_labels, f)

# load the pickle file: each load() returns the next object, in dump order.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open(feature_file, 'rb') as fid:
    word_features = pickle.load(fid)
    word_labels = pickle.load(fid)

# write to json file
with open("aaaa.json", "w") as outfile:
    json.dump(score_all, outfile, indent=2)

    # Alternative call (use instead of the one above, not in addition to it:
    # two dumps into the same file do not form one valid JSON document).
    # default=str stringifies values json cannot serialise; ensure_ascii=False
    # writes non-ASCII characters as-is rather than as \u escapes.
    # json.dump(anno_dict, outfile, default=str, ensure_ascii=False, indent=2)

# load json file: read the whole text, then parse it
with open(filename) as f:
    info = json.loads(f.read())

# write to txt file
# Each value is formatted with one decimal place (stray "]" after d removed).
temp_str = "total-{0:.1f}, acc-{1:.1f},\t com-{2:.1f},\t flu-{3:.1f}\n".format(a, b, c, d)

# The with-block closes the file once the line has been written.
with open("sentence_analysis.txt", 'wt') as f_sentence_analysis:
    f_sentence_analysis.write(temp_str)

# read txt file: collect every line (line endings included) into a list
with open('abc.txt') as f:
    lines = list(f)

# The list is fully in memory, so the loop can run after the file is closed.
for line in lines:
    pass

# Find all direct subfolders and sort the order
# The trailing "/" restricts the matches to directories. recursive=True was
# dropped: it only affects patterns containing "**", so it did nothing here.
speakers_list = [
    os.path.basename(os.path.normpath(path))
    for path in glob.glob(f"{self.in_dir}/*/")
]

# Sort by the first character, then numerically by the remainder, so that
# e.g. "s2" comes before "s10". int() raises ValueError when the remainder of
# a folder name is not a number.
speakers_list.sort(key=lambda name: (name[0], int(name[1:])))

Page updated

Google Sites

Report abuse