2010-10-18 14 views
0

J'essaie de convertir un ancien texte, saisi en caractères ASCII avec une police non anglaise, vers une nouvelle police Unicode. Les touches doivent donc être mises en correspondance. J'ai deux options. La première est d'avoir un fichier de correspondance comme ce sample.map (fichier texte) :

Convertir ASCII vers Unicode à partir d'une table de correspondance de police en utilisant PHP

w=ം
x=ഃ
A=അ
B=ആ
C=ഇ
Cu=ഈ
D=ഉ
Du=ഊ
E=ഋ
\p=ഌ
F=എ
G=ഏ
sF=ഐ
H=ഒ
Hm=ഓ
Hu=ഔ
I=ക
J=ഖ

Le code devra remplacer chaque caractère du côté gauche par celui du côté droit. Comment parcourir chaque caractère et le remplacer en prenant les informations dans le fichier de correspondance ? J'ai essayé des techniques de type « rechercher et remplacer », mais sans succès. Comment faire en sorte que PHP lise ce fichier de correspondance (un fichier texte portant l'extension .map), parcoure chaque caractère et le remplace sans détruire le document ?

Here is complete map file

J'ai aussi trouvé un script Python qui fait cela, mais je n'ai pas réussi à le porter en PHP. Je suis très faible en Python. Je colle le code ici :

import codecs
import os
import subprocess
import sys
from optparse import OptionParser

class Payyan:
    """Convert Malayalam text between a legacy ASCII-font encoding and Unicode.

    The conversion table is read from ``mapping_filename``, a UTF-8 text file
    holding one ``lhs=rhs`` rule per line (``lhs`` is the ASCII-font key,
    ``rhs`` the Unicode text).  Lines starting with ``#`` and blank lines are
    ignored.

    This implementation targets Python 3: all text is handled as ``str`` and
    Malayalam literals are written as ``\\u`` escapes so that the source file
    itself cannot be damaged by a wrong encoding.
    """

    # Vowel signs (and the conjunct "ra" sign) that the ASCII fonts type
    # *before* the consonant they belong to.
    _PREBASE_SIGNS = frozenset((
        "\u0d47",        # േ
        "\u0d48",        # ൈ
        "\u0d4a",        # ൊ
        "\u0d4b",        # ോ
        "\u0d4c",        # ൌ
        "\u0d4d\u0d30",  # ്ര
        "\u0d46",        # െ
    ))

    # Conjunct signs that follow the consonant but precede a pending
    # pre-base sign in the Unicode output.
    _POSTBASE_SIGNS = frozenset((
        "\u0d4d\u0d2f",  # ്യ
        "\u0d4d\u0d35",  # ്വ
    ))

    # Unicode -> ASCII: signs whose ASCII form is split around the consonant.
    _SPLIT_SIGNS = frozenset(("\u0d4b", "\u0d4a", "\u0d4c"))          # ോ ൊ ൌ
    # Unicode -> ASCII: signs whose ASCII form goes entirely before it.
    _PREBASE_ONLY_SIGNS = frozenset(("\u0d46", "\u0d47", "\u0d4d\u0d30"))  # െ േ ്ര
    # ASCII -> Unicode: independent vowels routed through getVowelSign().
    _COMPOSING_VOWELS = frozenset(("\u0d0e", "\u0d12"))               # എ ഒ

    def __init__(self):
        self.input_filename = ""
        self.output_filename = ""
        self.mapping_filename = ""
        self.rulesDict = None
        self.pdf = 0
        # Direction of the currently requested conversion: "a2u" or "u2a".
        self.direction = "a2u"
        # Direction the cached rulesDict was built for (None = unknown).
        self._rules_direction = None

    def word2ASCII(self, unicode_text):
        """Return *unicode_text* converted to the ASCII-font encoding.

        Sequences of up to three code points are looked up (longest first) so
        that conjuncts win over their parts.  Text with no rule is copied
        through unchanged.
        """
        self.direction = "u2a"
        self.rulesDict = self.LoadRules()
        rules = self.rulesDict

        ascii_text = ""
        index = 0
        while index < len(unicode_text):
            for size in (3, 2, 1):
                chunk = unicode_text[index:index + size]
                if chunk in rules:
                    ascii_letter = rules[chunk]
                    # TODO: make the pre-base handling generic so that it is
                    # usable for all Indian languages.
                    if chunk == "\u0d48":  # ൈ
                        ascii_text = (ascii_text[:-1] + ascii_letter * 2
                                      + ascii_text[-1:])
                    elif chunk in self._SPLIT_SIGNS:
                        # First half before the consonant, second half after.
                        ascii_text = (ascii_text[:-1] + ascii_letter[0]
                                      + ascii_text[-1:] + ascii_letter[1])
                    elif chunk in self._PREBASE_ONLY_SIGNS:
                        ascii_text = (ascii_text[:-1] + ascii_letter
                                      + ascii_text[-1:])
                    else:
                        ascii_text += ascii_letter
                    index += size
                    break
                if size == 1:
                    # No rule at any length: keep the character as it is.
                    ascii_text += chunk
                    index += 1
        return ascii_text

    def Uni2Ascii(self):
        """Convert the input (file or stdin) from Unicode to ASCII.

        Output goes to ``output_filename`` when set, otherwise to stdout.
        Returns 0.
        """
        return self._convert_stream(self.word2ASCII)

    def word2Unicode(self, ascii_text):
        """Return *ascii_text* converted from the ASCII-font encoding to Unicode.

        Keys of two characters are tried before single characters.  A
        pre-base sign is held back and emitted after the next base letter
        (and after a directly following post-base sign, if any).  Characters
        with no rule are copied through unchanged.
        """
        self.direction = "a2u"
        self.rulesDict = self.LoadRules()
        rules = self.rulesDict

        unicode_text = ""
        prebase_letter = ""
        postbase_letter = ""
        index = 0
        while index < len(ascii_text):
            for size in (2, 1):
                key = ascii_text[index:index + size]
                if key in rules:
                    unicode_letter = rules[key]
                    if self.isPrebase(unicode_letter):
                        # Remember it; it is written after the next letter.
                        prebase_letter = unicode_letter
                    else:
                        # Look one character ahead for a post-base sign.
                        post_index = index + size
                        if post_index < len(ascii_text):
                            lookahead = ascii_text[post_index]
                            if lookahead in rules:
                                candidate = rules[lookahead]
                                if self.isPostbase(candidate):
                                    postbase_letter = candidate
                                    index += 1  # consume the lookahead
                        if unicode_letter in self._COMPOSING_VOWELS:
                            # NOTE(review): the pending sign is passed as
                            # `vowel_letter` and the vowel as
                            # `vowel_sign_letter`, the reverse of the
                            # parameter names; kept as in the original.
                            unicode_text += postbase_letter + self.getVowelSign(
                                prebase_letter, unicode_letter)
                        else:
                            unicode_text += (unicode_letter + postbase_letter
                                             + prebase_letter)
                        prebase_letter = ""
                        postbase_letter = ""
                    index += size
                    break
                if size == 1:
                    unicode_text += key
                    index += 1
        return unicode_text

    def Ascii2Uni(self):
        """Convert the input (file or stdin) from ASCII to Unicode.

        When ``pdf`` is set, the input file is first run through the external
        ``pdftotext`` utility and the resulting ``.txt`` file is converted.

        Returns 0 on success, 1 when ``pdftotext`` is missing or fails.
        """
        if self.pdf:
            try:
                # Argument list instead of a shell string: file names
                # containing quotes or spaces are passed through intact.
                status = subprocess.call(["pdftotext", self.input_filename])
            except OSError:
                status = 1  # pdftotext is not installed
            if status:
                print("The input file is a PDF file. To convert this the pdftotext utility is required. ")
                print("This feature is available only for GNU/Linux Operating system.")
                return 1  # Error - no pdftotext !
            self.input_filename = os.path.splitext(self.input_filename)[0] + ".txt"
        return self._convert_stream(self.word2Unicode)

    def getVowelSign(self, vowel_letter, vowel_sign_letter):
        """Return the independent vowel formed by a vowel plus a vowel sign.

        Known combinations are എ+െ -> ഐ, ഒ+ാ -> ഓ and ഒ+ൗ -> ഔ; anything else
        is returned as the plain concatenation of the two arguments.
        """
        if vowel_letter == "\u0d0e":                # എ
            if vowel_sign_letter == "\u0d46":       # െ
                return "\u0d10"                     # ഐ
        if vowel_letter == "\u0d12":                # ഒ
            if vowel_sign_letter == "\u0d3e":       # ാ
                return "\u0d13"                     # ഓ
            if vowel_sign_letter == "\u0d57":       # ൗ
                return "\u0d14"                     # ഔ
        return vowel_letter + vowel_sign_letter

    def isPrebase(self, letter):
        """Return True if *letter* is a sign typed before its consonant."""
        return letter in self._PREBASE_SIGNS

    def isPostbase(self, letter):
        """Return True if *letter* is a post-base conjunct sign (്യ or ്വ)."""
        return letter in self._POSTBASE_SIGNS

    def LoadRules(self):
        """Return the rule table for the current ``direction``.

        For ``"a2u"`` the table maps ASCII keys to Unicode text, otherwise
        Unicode text to ASCII keys.  A table already stored in ``rulesDict``
        is reused unless it is known to have been built for the other
        direction.

        Raises:
            ValueError: a non-comment, non-blank line is not of the form
                ``lhs=rhs``.
        """
        cached_for = getattr(self, "_rules_direction", None)
        if self.rulesDict and cached_for in (None, self.direction):
            return self.rulesDict

        rules = {}
        rules_file = codecs.open(self.mapping_filename, encoding="utf-8",
                                 errors="ignore")
        try:
            # Line numbers are kept for error reporting.
            numbered = enumerate(iter(rules_file.readline, ""), 1)
            for line_number, text in numbered:
                if text[0] == "#":  # comment
                    continue
                line = text.strip()
                if line == "":
                    continue
                parts = line.split("=")
                if len(parts) != 2:
                    raise ValueError(
                        "Syntax Error in the Ascii to Unicode Map in line "
                        "number %d: %s" % (line_number, line))
                lhs, rhs = parts
                if self.direction == "a2u":
                    rules[lhs] = rhs
                else:
                    rules[rhs] = lhs
        finally:
            rules_file.close()

        self._rules_direction = self.direction
        return rules

    def _convert_stream(self, convert_line):
        """Apply *convert_line* to every input line; write or print the result.

        Reads ``input_filename`` (UTF-8) or stdin when it is empty, and writes
        to ``output_filename`` (UTF-8) or stdout when it is empty.  Files
        opened here are always closed.  Returns 0.
        """
        if self.input_filename:
            source = codecs.open(self.input_filename, encoding="utf-8",
                                 errors="ignore")
        else:
            # sys.stdin is already a text stream; codecs.open() needs a path.
            source = sys.stdin
        sink = None
        try:
            if self.output_filename:
                sink = codecs.open(self.output_filename, mode="w+",
                                   encoding="utf-8", errors="ignore")
            for line in iter(source.readline, ""):
                converted = convert_line(line)
                if sink is not None:
                    sink.write(converted)
                else:
                    print(converted)
        finally:
            if sink is not None:
                sink.close()
            if source is not sys.stdin:
                source.close()
        return 0

MISE À JOUR : Je pense que j'ai eu tort de parler d'ASCII. Il s'agit d'un texte écrit en caractères non anglais, que je veux convertir vers une police Unicode afin qu'il soit correctement affiché sur

Répondre

0

L'ASCII est strictement limité à 7 bits. Je suppose que vous avez peut-être affaire à de l'ISO-8859-1, à convertir en UTF-8 ou dans un autre encodage Unicode. iconv peut être utilisé pour cela :

http://php.net/manual/en/function.iconv.php

+0

Peut-être ; ce que je veux dire, c'est un texte écrit dans un éditeur de texte avec une police particulière. – esafwan

0

Gardez simplement à l'esprit que la fonction str_replace de PHP effectue les remplacements de gauche à droite. Dans ce cas, il suffit de trier le tableau des valeurs d'origine par ordre décroissant, de sorte que, par exemple, 'ss' soit rencontré avant 's' (sinon 'ss' sera remplacé par 'െെ' au lieu de 'ൈ').

$original = array("€", "Å“", "Å’", "ž", "Ž", "Ÿ", "Å¡", "Å ", "Ù", "À", "Û", "Ë", "É", "Ã…", "Õ", "Ç", "Æ", "Ä", "Ô", "Ó", "Â", "Ã’", "Ñ", "×", "Ö", "Ø", "È", "Ã", "Þ", "ÃŽ", "ß", "Ú", "Ê", "Ü", "ÃŒ", "þ", "õ", "ô", "ó", "ò", "ñ", "ð", "ï", "î", "Ã¥", "ä", "ã", "â", "á", "à ", "Ã", "Ã", "Ã", "Ã", "Ã", "¿", "¾", "½", "¼", "»", "º", "¹", "¸", "·", "¶", "µ", "´", "³", "²", "±", "°", "¯", "®", "¬", "«", "ª", "©", "¨", "§", "¦", "Â¥", "¤", "£", "¢", "¡", "­", "}", "|", "{", "z", "y", "x", "w", "v", "u", "tm", "t", "su", "ss", "sm", "sF", "s", "r", "q", "p", "o", "n", "m", "l", "k", "j", "i", "h", "g", "f", "e", "d", "c", "b", "a", "`", "_", "^", "]", "\p", "\", "[", "Z", "Y", "X", "W", "V", "U", "T", "S", "R", "Q", "P", "O", "N", "M", "L", "K", "J", "I", "Hu", "Hm", "H", "G", "F", "E", "Du", "D", "Cu", "C", "B", "A", "$"); 
$replaced = array("à´—àµà´—", "à´®àµà´®", "à´®àµà´ª", "à´ªàµà´ª", "à´¨àµà´¤", "à´®àµà´²", "à´šàµà´š", "à´™àµà´•", "à´¸àµà´¥", "à´¨àµà´¦", "à´¤àµà´­", "à´¹àµà´²", "à´³àµà´³", "à´®àµà´®", "à´žàµà´š", "à´¯àµà´¯", "à´µàµà´µ", "à´®àµà´ª", "à´¨àµà´§", "à´¹àµà´¨", "à´¨àµà´±", "à´¹àµà´®", "à´šàµà´›", "à´£àµà´®", "à´œàµà´œ", "à´¸àµà´¥", "à´²àµà´²", "à´ªàµà´ª", "à´£àµà´¡", "à´•àµà´Ÿ", "à´¤àµà´®", "à´œàµà´ž", "à´±àµà´±", "à´—àµà´®", "à´±àµà´±", "-", "à´­", "à´¸àµà´¸", "à´¨àµà´¨", "à´¨àµà´®", "à´²àµà´²", "à´²àµâ€", "à´¨àµà´®", "à´¨àµ", "à´±àµà´±", "à´·àµà´Ÿ", "à´¨àµà´±", "à´—àµà´¨", "à´£àµà´Ÿ", "à´•àµà´¤", "à´¶àµà´š", "à´¨àµà´¤", "à´¬àµà´§", "à´¡àµà´¡", "à´¨àµà´¨", "à´¤àµà´¤", "à´£àµà´Ÿ", "à´Ÿàµà´Ÿ", "à´žàµà´ž", "à´šàµà´š", "à´™àµà´™", "à´™àµà´•", "à´•àµà´·", "à´¨àµà´®", "à´¨àµà´¨", "à´¨àµà´¦", "à´¨àµà´¤", "à´³àµâ€", "à´²àµâ€", "à´°àµâ€", "à´¨àµâ€", "à´¤àµà´¤", "à´£àµà´£", "à´£àµà´Ÿ", "à´£àµâ€", "à´Ÿàµà´Ÿ", "à´¦àµà´§", "à´žàµà´š", "à´“", "à´™àµà´™", "à´™àµà´•", "à´¦àµà´¦", "à´ˆ", "à´•àµà´·", "à´•àµà´²", "à´•àµà´•", "àµà´°", "àµà´µ", "àµà´°", "àµà´µ", "àµà´¯", "à´ƒ", "à´‚", "ൌ", "േ", "ോ", "െ", "ൌ", "ൈ", "ൊ", "à´", "ൃ", "ൂ", "àµ", "ീ", "à´¿", "à´¾", "àµ", "à´±", "à´´", "à´³", "à´¹", "à´¸", "à´·", "à´¶", "à´µ", "à´²", "à´°", "à´¯", "à´®", "à´­", "à´¬", "à´«", "à´ª", "à´Œ", "à´¨", "à´§", "à´¦", "à´¥", "à´¤", "à´£", "à´¢", "à´¡", "à´ ", "à´Ÿ", "à´ž", "à´", "à´œ", "à´›", "à´š", "à´™", "à´˜", "à´—", "à´–", "à´•", "à´”", "à´“", "à´’", "à´", "à´Ž", "à´‹", "à´Š", "à´‰", "à´ˆ", "à´‡", "à´†", "à´…", "à´¸àµà´±àµà´±"); 

// str_replace() with array arguments substitutes $original[i] by $replaced[i]
// pair by pair in array order, so a longer key (e.g. "ss") must be listed in
// $original before any shorter key that is its prefix (e.g. "s").
$new_string = str_replace($original, $replaced, $old_string); 

Remarque : le jeu de caractères doit être Windows-1252, et non UTF-8, car str_replace avec des tableaux est imprévisible pour les remplacements multi-octets.