# Normalise a transcript in two passes.
# `re` is imported here so the snippet runs on its own.
import re

# Pass 1: replace every run of characters other than ASCII letters, digits,
# apostrophes and spaces with a single space.
transcript = re.sub(r'[^a-zA-Z0-9\' ]+', ' ', transcript)
# Pass 2: trim both ends, then collapse any run of two or more whitespace
# characters into one space. The pattern is a raw string so that `\s` reaches
# the regex engine instead of being an invalid string escape (which Python
# reports as a warning).
transcript = re.sub(r"\s\s+", " ", transcript.strip())
# Collect digit sequences of at least two digits, optionally split by
# whitespace (e.g. "12 34"), that are not immediately followed by a lowercase
# ASCII letter or another digit. A lone single digit is never matched.
finds = re.compile(r'\d+\s*\d+(?![a-z\d])').findall(transcript)
# Capture each decimal number that directly follows a "|": any number of
# leading "-" signs, one or more integer digits, a "." and the first
# fractional digit (e.g. "|-12.5" yields "-12.5").
# The pattern is a raw string so that `\|`, `\d` and `\.` are passed to the
# regex engine verbatim rather than being invalid string escapes.
finds = re.findall(r"\|(-*\d+\.\d)", transcript)
# Locate every digit run that is immediately followed by a lowercase ASCII
# letter. The lookahead keeps the letter out of the match, so each span covers
# the digits only.
find_idx = re.finditer(r'\d+(?=[a-z])', transcript)
# Iterate the match iterator created above (the loop previously referenced an
# undefined name). Note that an iterator can be consumed only once.
for item in find_idx:
    start = item.start(0)  # index of the first digit
    end = item.end(0)  # index one past the last digit
# Remove punctuation from a string in a single translate() pass.
import string

# Characters to delete. The apostrophe is not in this set, so it is kept in
# the output (add ' to the set to strip it as well). The literal is raw so the
# backslash is an ordinary character rather than the start of an invalid
# escape sequence; the resulting string is unchanged.
punc = r'''!()-[]{};:"\,<>./?@#$%^&*_~'''
new_string = old_string.translate(str.maketrans('', '', punc))
# Alternative: delete every ASCII punctuation character, apostrophe included.
# new_string = old_string.translate(str.maketrans('', '', string.punctuation))
# find all audio files (wav, mp3, m4a, aac) recursively under a folder
import glob, os
import pandas as pd
import subprocess
import csv
import json
# Walk audio_folder recursively and build the full path of every audio file.
# Single-extension alternative (Python 3 only):
# for f in glob.glob(audio_folder+'/**/*.mp3', recursive=True):
AUDIO_EXTENSIONS = {".wav", ".mp3", ".m4a", ".aac"}
for r, d, f in os.walk(audio_folder):
    for file in f:
        # splitext() returns the real extension, so a file that is literally
        # named "wav" (no dot) is not mistaken for audio; lower() also accepts
        # upper-case extensions such as ".WAV".
        if os.path.splitext(file)[1].lower() in AUDIO_EXTENSIONS:
            audio_path = os.path.join(r, file)
# Normalise the audio with ffmpeg: 1 channel, 16000 Hz sample rate,
# pcm_s16le codec. Skipped when output_name already exists as a file.
if not os.path.isfile(output_name):
    ffmpeg_cmd = [
        'ffmpeg',
        '-i', original_audio_path,
        '-ac', '1',
        '-ar', '16000',
        '-acodec', 'pcm_s16le',
        output_name,
    ]
    # call() waits for ffmpeg to finish; its exit status is not inspected.
    subprocess.call(ffmpeg_cmd)
# Read the Excel workbook into a DataFrame using the openpyxl engine, then
# visit it row by row.
# NOTE(review): `skip_blank_lines` does not appear among the parameters
# documented for recent pandas `read_excel` - confirm the installed version
# accepts it.
df = pd.read_excel(excel_name, engine='openpyxl', skip_blank_lines=False)
anno_dict = {}
for row_ind, row_cont in df.iterrows():
    # row_ind is the index label, row_cont the row as a Series.
    pass
# Create (or overwrite) the CSV file and write its header row.
# The context manager guarantees the file is flushed and closed even if a
# write fails. Mode "w" already empties an existing file, so no explicit
# truncate() is needed.
with open(saveName, "w", encoding="utf-8") as fout:
    line = "wav_filename,wav_filesize,transcript,phoneme,AccComFluProTotal\n"
    fout.write(line)
    # Write the data rows here, while the file is still open.
# Read a comma-separated annotation file row by row, skipping the header.
# newline="" lets the csv module handle line endings itself, which is what the
# csv documentation requires for files passed to csv.reader.
with open(annotation_file, encoding="utf8", newline="") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    # Discard the header row; the default keeps an empty file from raising
    # StopIteration.
    next(reader, None)
    for row in reader:
        # row is a list of the column strings of one line.
        pass
# Store two objects back to back in one binary pickle file.
# `pickle` is imported here so the snippet runs on its own.
import pickle

with open("features_word_scoring.pkl", 'wb') as f:
    # The objects are written sequentially and must be read back with the same
    # number of pickle.load() calls, in the same order.
    pickle.dump(word_features, f)
    pickle.dump(word_labels, f)
# Read back two objects that were pickled one after the other into a single
# file; each load() call consumes the next object in the stream.
# WARNING: unpickling can execute arbitrary code - only load trusted files.
with open(feature_file, 'rb') as fid:
    word_features, word_labels = [pickle.load(fid) for _ in range(2)]
# Serialise score_all to a JSON file, pretty-printed with a 2-space indent.
with open("aaaa.json", "w") as outfile:
    json.dump(score_all, outfile, indent=2)
# Variant: default=str converts values json cannot encode natively via str(),
# and ensure_ascii=False writes non-ASCII characters unescaped.
# NOTE(review): `tf` is not defined in this snippet - presumably a text-mode
# file handle opened elsewhere; confirm before use.
json.dump(anno_dict, tf, default=str, ensure_ascii=False, indent=2)
# Parse a JSON file into Python objects.
# The encoding is given explicitly: without it open() uses the platform
# default, which can mis-decode non-ASCII characters in the file.
with open(filename, encoding="utf-8") as f:
    info = json.load(f)
# Write one formatted line of scores to a text file.
# The context manager closes the file even if formatting or writing fails.
with open("sentence_analysis.txt", 'wt') as f_sentence_analysis:
    # Each value is rendered with one decimal place. (The original call had a
    # stray "]" after the last argument, which was a syntax error.)
    temp_str = "total-{0:.1f}, acc-{1:.1f},\t com-{2:.1f},\t flu-{3:.1f}\n".format(a, b, c, d)
    f_sentence_analysis.write(temp_str)
# Read a whole text file into a list of lines, then visit each line.
with open('abc.txt') as f:
    # Iterating the file yields the same lines as readlines(); each one keeps
    # its trailing newline.
    lines = list(f)
for line in lines:
    pass
# List the names of the direct subfolders of self.in_dir, in sorted order.
# The trailing "/" in the pattern restricts glob to directories; normpath()
# drops that trailing separator so basename() returns the folder name.
# recursive=True has no effect here because the pattern contains no "**".
speakers_list = [
    os.path.basename(os.path.normpath(speaker_dir))
    for speaker_dir in glob.glob(f"{self.in_dir}/*/", recursive=True)
]
# Order by the first character, then by the integer value of the rest of the
# name. int() raises ValueError for names whose remainder is not an integer.
speakers_list.sort(key=lambda name: (name[0], int(name[1:])))