# Jan Labanowski, jkl@osc.edu, Jan. 10, 1992
# File lc_koi8.dat
#
# Transliteration data file for converting from various phonetic
# transliteration schemes to KOI-8 as used by RELCOM (GOST 19768-74).
# Phonetic transliteration cannot be represented exactly, since it is
# very flexible and frequently contradictory.
# This file is complicated (in my humble opinion), and it will take a
# substantial amount of time to process longer files on a slower computer.
# However, this is what you get if you want something more or less general.
# If your phonetic transliteration is consistent and unequivocal, you will
# be much better off writing a specific transliteration file, e.g. the
# GOST 16876-71 transliteration file (phg) or the Pokrovsky scheme (php).
# English text should be enclosed in braces {}, while Russian text stays
# outside the braces.
# To be used with the translit.c program by Jan Labanowski. For the format
# of this file consult the translit documentation.

1        file version number

" "      # string delimiters
[ ]      # list delimiters
{ }      # regular expression delimiters

# starting sequence
""
# ending sequence
""

2        # number of input SHIFT sequences
""   ""   ""   ""   ""   ""      # no SHIFT-OUT/IN for Russian letters
"{"  ""   ""   ""   "}"  ""      # Latin text in braces {}

0        # number of output SHIFT sequences, two sets of input characters

# conversion table
# Columns: inp_set  inp_seq  out_set  out_seq
# NOTE(review): the comments in this file call a negative out_set a
# "backstep" (the result is matched again); confirm the exact meaning of
# -1 / -2 against the translit documentation before changing any of them.

# Latin (ASCII) text is enclosed in {}: each listed character maps to itself
2   ["'A-Za-z]              0   ["'A-Za-z]

# Cyrillic letters

# If already converted to KOI8 by backstepping, send it to output
1   {([\0x80-\0xFF])}       0   {\1}

# Convert " followed by a capital letter to capital Tvyordyj znak and
# backstep; otherwise " will be treated as a small tvyordyj znak
1   {"([^A-Za-z])}          0   {"\1}           # " at the end
1   {([^A-Za-z])"}          0   {\1"}           # " at the beginning
1   {"([A-Z])}             -2   {\0xFF\1}       # capital Tvyordyj znak
1   {Q[Hh]}                 0   "\0xFF"         # Some use it as Tvyordyj
1   "qh"                    0   "\0xDF"         # Some use it as tvyordyj
1   "\0x22"                 0   "\0xDF"         # tvyordyj znak

# Convert ' preceded by a capital letter to
# capital Myagkij znak and backstep;
# otherwise ' will be treated as a small myagkij znak
1   "''"                    0   "''"            # double quote
1   {([^A-Za-z])'}          0   {\1'}           # opening quote
# Myagkij znak
1   {((S[Hh][Cc][Hh])|(S[Hh])|(C[Hh])|(T[Cc][Hh])|([A-Z]))'}   -1   {\1\0xF8}
1   "Q"                     0   "\0xF8"
1   "'"                     0   "\0xD8"         # myagkij znak
1   "q"                     0   "\0xD8"

# Y/I/J after a vowel and before a non-letter becomes I kratkoje.
# Group numbers (counted by opening parenthesis):
#   \1 = the whole vowel part, \2 and \3 = its two alternatives,
#   \4 = the trailing Y/I/J, \5 = Y, \6 = I or J, \7 = the following non-letter
1   {(([YIJ]?[EOUA])|([J]?[EOAUY]))((Y)|([IJ]))([^A-Za-z])}   -1   {\1\0xEA\7}   # -J
1   {(([yij]?[eoua])|([j]?[eoauy]))((y)|([ij]))([^A-Za-z])}   -1   {\1\0xCA\7}   # -j

# The story of ts versus c (the ts for c was a stupid idea of the Library of
# Congress --- very, very stupid... T and S should be T and S, not C).
# N is \0xEE / \0xCE in KOI-8 (\0xED / \0xCD is M), so "inc" yields I-N-C.
1   "INTS"                  0   "\0xE9\0xEE\0xE3"               # INC
1   "INC"                   0   "\0xE9\0xEE\0xE3"               # INC
1   "ints"                  0   "\0xC9\0xCE\0xC3"               # inc
1   "inc"                   0   "\0xC9\0xCE\0xC3"               # inc
1   "CI"                    0   "\0xE3\0xE9"
1   "ci"                    0   "\0xC3\0xC9"
# V is \0xF7 / \0xD7 in KOI-8 (\0xE2 / \0xC2 is B), so "avia" yields A-V-I-A.
1   {AVIA(TS|C)}            0   "\0xE1\0xF7\0xE9\0xE1\0xE3"     # AVIAC
1   {avia(ts|c)}            0   "\0xC1\0xD7\0xC9\0xC1\0xC3"     # aviac
1   {tsi([iyjo])}          -2   {\0xC3\0xC9\1}                  # ci
1   {TSI([IYJO])}          -2   {\0xE3\0xE9\1}                  # CI
1   {T[Ss]([Aa])}          -2   {\0xE3\1}                       # CA
1   {t[Ss]([Aa])}          -2   {\0xC3\1}                       # ca
1   {([DdKk])T[Ss]}        -1   {\1\0xE3}                       # DC or KC
1   {([DdKk])t[Ss]}        -1   {\1\0xC3}                       # dc or kc
1   {TS([^A-Za-z])}        -2   {\0xE3\1}                       # C
1   {ts([^A-Za-z])}        -2   {\0xC3\1}                       # c

# Je --- people frequently write e instead of Je.
# E oborotnoje is frequently at the beginning of foreign origin words
1   "AER"                       0   "\0xE1\0xFC\0xF2"
1   {([Aa])er}                 -1   {\1\0xDC\0xD2}
1   {([A-Za-z])'[IiYyJj]?E}    -1   {\1\0xF8\0xE5}      # Je
1   {([A-Za-z])'[IiYyJj]?e}    -1   {\1\0xD8\0xC5}      # je

# Capital Je (the regular expression continues on the next line after "\")
1   {([^A-Za-z])E(([Mm][Uu]?[^A-Za-z])|([Mm][Ll])|([Ll][^EeIiLlYyJj'])\
|([Ll][YyIiJj]?[Ee][^A-Za-z])|([Rr][Uu])|([Ss][HhTtLl])|([Kk][Aa]))}   -1   {\1\0xE5\2}   # Je
# Small je (the regular expression continues on the next line after "\")
1   {([^A-Za-z])e(([Mm][Uu]?[^A-Za-z])|([Mm][Ll])|([Ll][^EeIiLlYyJj'])\
|([Ll][YyIiJj]?[Ee][^A-Za-z])|([Rr][Uu])|([Ss][HhTtLl])|([Kk][Aa]))}   -1   {\1\0xC5\2}   # je

# Capital Eh
1   {([^A-Za-z])E(([Ll][Ee][KkGg])|([KLMNPRSTFklmnprstf]))}   -1   {\1\0xFC\2}   # Eh
# Small eh
1   {([^A-Za-z])e(([Ll][Ee][KkGg])|([KLMNPRSTFklmnprstf]))}   -1   {\1\0xDC\2}   # eh

1   {([iIOoPpUuFfYy])i[Ee]}    -1   {\1\0xC5}           # ie->je
1   {([iIOoPpUuFfYy])I[Ee]}    -1   {\1\0xE5}           # ie->je

# Eh is e oborotnoje but not at the end of the word
1   {E[Hh]([^A-Za-z]+)}         0   {\0xE5\0xE8\1}
1   {e[Hh]([^A-Za-z]+)}         0   {\0xC5\0xC8\1}
1   {E[Hh]}                     0   "\0xFC"             # E oborotnoje
1   "eh"                        0   "\0xDC"             # e oborotnoje

# Various I kratkoje
1   {J[Ii]}                     0   "\0xEA"             # I kratkoje
1   {J[Jj]}                     0   "\0xEA"
1   "ji"                        0   "\0xCA"             # i kratkoje
1   "jj"                        0   "\0xCA"

# SHCH
1   {s[Hh][Cc][Hh]}             0   "\0xDD"
1   "w"                         0   "\0xDD"
1   {S[Hh][Cc][Hh]}             0   "\0xFD"
1   "W"                         0   "\0xFD"

# Capital digraphs and their exceptions
1   {[YJ][Oo]}                  0   "\0xB3"             # capital Jo
1   {J[Ee]}                     0   "\0xE5"             # Je
# The pattern has a single group, (M[^A-Za-z]), so the output must use \1.
1   {RIU(M[^A-Za-z])}          -1   {\0xF2\0xE9\0xF5\1} # IU
1   {([^A-Za-z])I([Uu][Dd])}   -1   {\1\0xE9\2}
1   "DIUS"                      0   "\0xE4\0xE9\0xF5\0xF3"
1   {[IYJ][Uu]}                 0   "\0xE0"             # Ju
1   {([Dd])I([Aa][KkGgPp])}    -1   {\1\0xE9\2}         # dia
1   "RIAL"                      0   "\0xF2\0xE9\0xE1\0xEC"   # rial
1   "KIA"                       0   "\0xEB\0xE9\0xE1"   # kia
1   {[IYJ][Aa]}                 0   "\0xF1"             # Ja
1   {Z[Hh]}                     0   "\0xF6"
1   {K[Hh]}                     0   "\0xE8"
1   {H[Hh]}                     0   "\0xE8"
1   {C[Hh]}                     0   "\0xFE"
1   {S[Hh]}                     0   "\0xFB"

# Small digraphs and their exceptions
1   "zh"                        0   "\0xD6"
1   "kh"                        0   "\0xC8"
1   "hh"                        0   "\0xC8"
1   "ch"                        0   "\0xDE"
1   "sh"                        0   "\0xDB"
1   {[yj]o}                     0   "\0xA3"             # jo
1   "je"                        0   "\0xC5"             # je
1   {([Rr])iu([Mm][^A-Za-z])}  -1   {\1\0xC9\0xD5\2}    # iu
1   {([^A-Za-z])i(ud)}         -1   {\1\0xC9\2}
1   "dius"                      0   "\0xC4\0xC9\0xD5\0xD3"
1   {[iyj]u}                    0   "\0xC0"             # ju
1   {([Dd])ia([kgp])}          -1   {\1\0xC9\0xC1\2}    # dia
1   "rial"                      0   "\0xD2\0xC9\0xC1\0xCC"   # rial
1   "kia"                       0   "\0xCB\0xC9\0xC1"   # kia
1   {[iyj]a}                    0   "\0xD1"             # ja

# Single capital letters
1   "A"   0   "\0xE1"
1   "B"   0   "\0xE2"
1   "V"   0   "\0xF7"
1   "G"   0   "\0xE7"
1   "D"   0   "\0xE4"
1   "Z"   0   "\0xFA"
1   "I"   0   "\0xE9"
1   "J"   0   "\0xEA"     # I kratkoje
1   "K"   0   "\0xEB"
1   "L"   0   "\0xEC"
1   "M"   0   "\0xED"
1   "N"   0   "\0xEE"
1   "O"   0   "\0xEF"
1   "P"   0   "\0xF0"
1   "R"   0   "\0xF2"
1   "S"   0   "\0xF3"
1   "T"   0   "\0xF4"
1   "U"   0   "\0xF5"
1   "F"   0   "\0xE6"
1   "X"   0   "\0xE8"     # Kha
1   "H"   0   "\0xE8"     # Kha
1   "C"   0   "\0xE3"
1   "Y"   0   "\0xF9"
1   "E"   0   "\0xE5"     # Je

# Single small letters
1   "a"   0   "\0xC1"
1   "b"   0   "\0xC2"
1   "v"   0   "\0xD7"
1   "g"   0   "\0xC7"
1   "d"   0   "\0xC4"
1   "z"   0   "\0xDA"
1   "i"   0   "\0xC9"
1   "j"   0   "\0xCA"
1   "k"   0   "\0xCB"
1   "l"   0   "\0xCC"
1   "m"   0   "\0xCD"
1   "n"   0   "\0xCE"
1   "o"   0   "\0xCF"
1   "p"   0   "\0xD0"
1   "r"   0   "\0xD2"
1   "s"   0   "\0xD3"
1   "t"   0   "\0xD4"
1   "u"   0   "\0xD5"
1   "f"   0   "\0xC6"
1   "x"   0   "\0xC8"     # kha
1   "h"   0   "\0xC8"     # kha
1   "c"   0   "\0xC3"
1   "y"   0   "\0xD9"
1   "e"   0   "\0xC5"     # je