# -*- coding: utf-8 -*-
"""
Thai romanization.

Reference romanization table: https://www.loc.gov/catdir/cpso/romanization/thai.pdf
"""

from __future__ import absolute_import, unicode_literals, print_function


from pythainlp.tokenize import word_tokenize

import numpy as np

# keras, the pretrained model, and the corpus downloader are imported lazily
# inside the thai2rom class below, so they are only required when that
# engine is actually used.

from pythainlp.transliterate import romanize


# sudo apt-get install pkg-config libicu-dev

# pip install --no-binary=:pyicu: pyicu
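
# A minimal sketch of the ICU transliteration that the 'pyicu' engine relies
# on, assuming PyICU is installed (illustrative only, not used below):
#
#   from icu import Transliterator
#   translit = Transliterator.createInstance('Thai-Latin')
#   print(translit.transliterate('แมว'))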

#text='ผมรักคุณนะครับโอเคบ่พวกเราเป็นคนไทยรักภาษาไทยภาษาบ้านเกิด'

#a=word_tokenize(text,engine='icu') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอ', 'เค', 'บ่', 'พวก', 'เรา', 'เป็น', 'คน', 'ไทย', 'รัก', 'ภาษา', 'ไทย', 'ภาษา', 'บ้าน', 'เกิด']

#b=word_tokenize(text,engine='mm') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด']

#c=word_tokenize(text,engine='pylexto') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด']

#d=word_tokenize(text,engine='newmm') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด']

#e=word_tokenize(text,engine='wordcutpy') # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คน', 'ไทย', 'รัก', 'ภาษา', 'ไทย', 'ภาษา', 'บ้านเกิด']

text = "สถาบันบัณฑิตพัฒนบริหารศาสตร์"

print(word_tokenize(text, engine="newmm")) #['สถาบันบัณฑิตพัฒนบริหารศาสตร์']

print(word_tokenize(text, engine="icu")) #['สถาบัน', 'บัณฑิต', 'พัฒนบริหารศาสตร์']

print(romanize("แมว")) # output: 'maeo'

print(romanize("เกรียง"))  # default engine ('royin')

print(romanize("เกรียง", engine="royin"))

print(romanize("เกรียง", engine="pyicu"))  # requires PyICU

print(romanize("เกรียง", engine="thai2rom"))  # requires keras and a model download

print(romanize("ศักดิ์"))

class thai2rom:
    def __init__(self):
        '''
        Thai2Rom: character-level sequence-to-sequence model for Thai romanization
        '''
        # Lazy imports: keras and the corpus downloader are only needed for
        # this deep-learning engine.
        from pythainlp.corpus import get_file, download
        from keras.models import Model, load_model
        from keras.layers import Input

        self.batch_size = 64
        self.epochs = 100
        self.latent_dim = 256
        self.num_samples = 648241
        # Fetch the training dataset, downloading it on first use.
        self.data_path = get_file('thai2rom-dataset')
        if self.data_path is None:
            download('thai2rom-dataset')
            self.data_path = get_file('thai2rom-dataset')
        self.input_texts = []
        self.target_texts = []
        self.input_characters = set()
        self.target_characters = set()
        with open(self.data_path, 'r', encoding='utf-8-sig') as f:
            lines = f.read().split('\n')
        for line in lines[: min(self.num_samples, len(lines) - 1)]:
            input_text, target_text = line.split('\t')
            if len(input_text) < 30 and len(target_text) < 90:
                # Tab marks the start of a target sequence, newline the end.
                target_text = '\t' + target_text + '\n'
                self.input_texts.append(input_text)
                self.target_texts.append(target_text)
                for char in input_text:
                    if char not in self.input_characters:
                        self.input_characters.add(char)
                for char in target_text:
                    if char not in self.target_characters:
                        self.target_characters.add(char)
        self.input_characters = sorted(list(self.input_characters))
        self.target_characters = sorted(list(self.target_characters))
        self.num_encoder_tokens = len(self.input_characters)
        self.num_decoder_tokens = len(self.target_characters)
        self.max_encoder_seq_length = max(len(txt) for txt in self.input_texts)
        self.max_decoder_seq_length = max(len(txt) for txt in self.target_texts)
        # print('Number of samples:', len(self.input_texts))
        # print('Number of unique input tokens:', self.num_encoder_tokens)
        # print('Number of unique output tokens:', self.num_decoder_tokens)
        # print('Max sequence length for inputs:', self.max_encoder_seq_length)
        # print('Max sequence length for outputs:', self.max_decoder_seq_length)
        self.input_token_index = dict((char, i) for i, char in enumerate(self.input_characters))
        self.target_token_index = dict((char, i) for i, char in enumerate(self.target_characters))
        # One-hot encode every input text, character by character.
        self.encoder_input_data = np.zeros(
            (len(self.input_texts), self.max_encoder_seq_length, self.num_encoder_tokens),
            dtype='float32')
        for i, input_text in enumerate(self.input_texts):
            for t, char in enumerate(input_text):
                self.encoder_input_data[i, t, self.input_token_index[char]] = 1.
        # Restore the trained model and construct the inference-time
        # encoder and decoder.
        self.filemodel = get_file('thai2rom')
        if self.filemodel is None:
            download('thai2rom')
            self.filemodel = get_file('thai2rom')
        self.model = load_model(self.filemodel)
        self.encoder_inputs = self.model.input[0]  # input_1
        self.encoder_outputs, self.state_h_enc, self.state_c_enc = self.model.layers[2].output  # lstm_1
        self.encoder_states = [self.state_h_enc, self.state_c_enc]
        self.encoder_model = Model(self.encoder_inputs, self.encoder_states)
        self.decoder_inputs = self.model.input[1]  # input_2
        self.decoder_state_input_h = Input(shape=(self.latent_dim,), name='input_3')
        self.decoder_state_input_c = Input(shape=(self.latent_dim,), name='input_4')
        self.decoder_states_inputs = [self.decoder_state_input_h, self.decoder_state_input_c]
        self.decoder_lstm = self.model.layers[3]
        self.decoder_outputs, self.state_h_dec, self.state_c_dec = self.decoder_lstm(
            self.decoder_inputs, initial_state=self.decoder_states_inputs)
        self.decoder_states = [self.state_h_dec, self.state_c_dec]
        self.decoder_dense = self.model.layers[4]
        self.decoder_outputs = self.decoder_dense(self.decoder_outputs)
        self.decoder_model = Model(
            [self.decoder_inputs] + self.decoder_states_inputs,
            [self.decoder_outputs] + self.decoder_states)
        # Reverse lookups: token index back to character.
        self.reverse_input_char_index = dict((i, char) for char, i in self.input_token_index.items())
        self.reverse_target_char_index = dict((i, char) for char, i in self.target_token_index.items())

    def decode_sequence(self, input_seq):
        # Encode the input and use the encoder state to prime the decoder.
        states_value = self.encoder_model.predict(input_seq)
        # Start the decoder with the start-of-sequence character ('\t').
        target_seq = np.zeros((1, 1, self.num_decoder_tokens))
        target_seq[0, 0, self.target_token_index['\t']] = 1.
        stop_condition = False
        decoded_sentence = ''
        while not stop_condition:
            output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value)
            # Greedy sampling: take the most probable next character.
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_char = self.reverse_target_char_index[sampled_token_index]
            decoded_sentence += sampled_char
            # Stop at the end-of-sequence character ('\n') or the length limit.
            if sampled_char == '\n' or len(decoded_sentence) > self.max_decoder_seq_length:
                stop_condition = True
            # Feed the sampled character back in as the next decoder input.
            target_seq = np.zeros((1, 1, self.num_decoder_tokens))
            target_seq[0, 0, sampled_token_index] = 1.
            states_value = [h, c]
        return decoded_sentence

    def encode_input(self, name):
        # One-hot encode a single input string for the encoder.
        test_input = np.zeros((1, self.max_encoder_seq_length, self.num_encoder_tokens), dtype='float32')
        for t, char in enumerate(name):
            test_input[0, t, self.input_token_index[char]] = 1.
        return test_input

    def romanization(self, text):
        # Encode the Thai text, then decode it into its romanized form.
        # This is the method the module-level romanization() function calls.
        return self.decode_sequence(self.encode_input(text))
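
# Illustrative usage of the thai2rom class above, assuming keras plus the
# 'thai2rom' model and dataset are available for download:
#
#   engine = thai2rom()
#   print(engine.romanization('แมว'))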


def romanization(data, engine='royin'):
    """
    :param str data: Thai text to be romanized
    :param str engine: one of 'royin', 'pyicu' and 'thai2rom'. 'royin' romanizes according to the standard of the Royal Institute of Thailand, 'pyicu' romanizes according to the International Phonetic Alphabet, and 'thai2rom' is a deep-learning Thai romanizer.
    :return: English (more or less) text that spells out how the Thai text should be read.
    """

    listword = []
    if engine == 'royin':
        from .royin import romanize
    elif engine == 'pyicu':
        from .pyicu import romanize
    elif engine == 'thai2rom':
        from pythainlp.romanization.thai2rom import thai2rom
        thai = thai2rom()
        return thai.romanization(data)
    else:
        raise ValueError("romanization: unknown engine '%s'" % engine)
    try:
        # Romanize word by word so that word boundaries are respected.
        word_list = word_tokenize(data)
        for word in word_list:
            listword.append(romanize(word))
    except Exception:
        # Fall back to romanizing the whole string at once.
        listword = [romanize(data)]
    return ''.join(listword)


#print(romanization('เกรียง'))
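
# Illustrative calls to romanization() with each engine (outputs depend on
# which optional dependencies are installed, so they are not shown here):
#
#   print(romanization('เกรียง'))                     # Royal Institute rules
#   print(romanization('เกรียง', engine='pyicu'))     # requires PyICU
#   print(romanization('เกรียง', engine='thai2rom'))  # requires keras + model download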