中文處理 (一)

程式列表

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Normalize Chinese text: full-width letters/symbols and Chinese digits.

Reads a UTF-8 text file named on the command line and writes the
normalized text to standard output.  Requires Python 3 (uses
``str.translate`` on text strings and ``open(..., encoding=...)``).
"""

import sys
import fileinput

# Full-width Latin capitals U+FF21..U+FF3A -> ASCII "A".."Z".
# Built from code points so the table cannot be damaged by editors or
# tools that normalize full-width characters to their ASCII look-alikes.
_FULLWIDTH_UPPER_TABLE = {0xFF21 + i: ord("A") + i for i in range(26)}

# Full-width punctuation / arithmetic operators -> ASCII equivalents.
# A value of None deletes the character.
_SYMBOL_TABLE = {
    0xFF08: "(",   # full-width left parenthesis
    0xFF09: ")",   # full-width right parenthesis
    0xFF05: "%",   # full-width percent sign
    0xFF0E: ".",   # full-width full stop
    0x3000: None,  # ideographic (full-width) space: removed
    0x0020: None,  # ASCII space: removed as well
    0xFF0B: "+",   # full-width plus sign
    0xFF0D: "-",   # full-width hyphen-minus
    0xFF0A: "*",   # full-width asterisk
    0xFF0F: "/",   # full-width solidus
}

# Chinese numerals (and the circle used as zero) -> Arabic digits.
_DIGIT_TABLE = str.maketrans("一二三四五六七八九○", "1234567890")


def ConvertChinEnglish(textLine: str) -> str:
    """Convert full-width English capital letters to half-width ASCII.

    Args:
        textLine: Text to convert.

    Returns:
        A copy of ``textLine`` in which U+FF21..U+FF3A are replaced by
        "A".."Z".  All other characters (including ASCII letters and
        full-width lowercase letters) are left untouched.
    """
    return textLine.translate(_FULLWIDTH_UPPER_TABLE)


def ConvertSymbols(textLine: str) -> str:
    """Convert full-width punctuation and arithmetic signs to half-width.

    Also rewrites the phrases "個百分點" and "百分點" as "%", and removes
    both ASCII and ideographic spaces.

    Args:
        textLine: Text to convert.

    Returns:
        The converted text.
    """
    # Phrase replacements run before space removal so that deleting a
    # space can never create a new phrase match.  The longer phrase must
    # be handled first, otherwise its "個" prefix would be left behind.
    converted = textLine.replace("個百分點", "%")
    converted = converted.replace("百分點", "%")
    return converted.translate(_SYMBOL_TABLE)


def ConvertNumbers(textLine: str) -> str:
    """Convert Chinese digit characters to Arabic digits.

    This is a per-character substitution ("一".."九" -> 1..9, "○" -> 0);
    positional words such as "十" or "百" are not interpreted.

    Args:
        textLine: Text to convert.

    Returns:
        The converted text.
    """
    return textLine.translate(_DIGIT_TABLE)


def ReadTextFile(fileName: str) -> None:
    """Read a UTF-8 text file and write its normalized content to stdout.

    Each line is stripped of surrounding whitespace; empty lines are
    skipped.  The remaining lines are passed through ConvertSymbols,
    ConvertNumbers and ConvertChinEnglish (in that order) and written to
    standard output.

    NOTE: no separator is written between lines, so the output is one
    continuous string.

    Args:
        fileName: Path of the UTF-8 encoded text file to read.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the file even if a conversion raises.
    with open(fileName, "r", encoding="utf-8") as myFile:
        for textLine in myFile:
            # Drop leading/trailing whitespace, including the newline.
            textLine = textLine.strip()
            if not textLine:
                continue

            # Full-width punctuation and arithmetic signs -> half-width.
            textLine = ConvertSymbols(textLine)
            # Chinese digits -> Arabic digits.
            textLine = ConvertNumbers(textLine)
            # Full-width English letters -> half-width.
            textLine = ConvertChinEnglish(textLine)

            sys.stdout.write(textLine)


# Main program
if __name__ == "__main__":
    # sys.argv[0] is the script name, so a file argument is present only
    # when there are at least two entries.
    if len(sys.argv) > 1:
        ReadTextFile(sys.argv[1])
    else:
        sys.stderr.write("usage: %s FILE\n" % sys.argv[0])
        sys.exit(1)