# Jan Labanowski, jkl@osc.edu, Jan. 10, 1992 # File lc_koi8.dat

# This is a transliteration data file for converting from various phonetic
# transliteration schemes to KOI-8 as used by RELCOM (GOST 19768-74).
# It is not possible to exactly represent phonetic transliteration
# since it is very flexible and frequently contradictory.
# This file is complicated (in my humble opinion) and it will take
# a substantial amount of time to process longer files on a slower
# computer. However, this is what you get, if you want something more
# or less general. If your phonetic transliteration is consistent
# and unequivocal, you will be much better off writing a specific
# transliteration file, e.g. the GOST 16876-71 transliteration file (phg) or
# Pokrovsky scheme (php).
# The English text should be enclosed in braces {}, while the Russian text
# stays outside the braces.
# To be used with the translit.c program by Jan Labanowski. For the format of
# this file consult the translit documentation.

   1            file version number
# NOTE(review): the description on the version line above has no leading
# comment marker; presumably the parser ignores the rest of that line --
# confirm against the translit documentation before touching it.

# Delimiter pairs used by the rules in the conversion table further down.
   "    "      # string delimiters
   [    ]      # list delimiters
   {    }      # regular expression delimiters


# starting sequence (empty in this file)
""

# ending sequence (empty in this file)
""

# Two input character sets are declared, one row of six strings per set.
# Set 1 is the Russian transliteration and needs no shift strings; set 2 is
# the Latin text, opened by a left brace and closed by a right brace.
   2     # number of input SHIFT sequences
""  ""  ""  ""  ""  ""    # no SHIFT-OUT/IN for Russian letters
"{" ""  ""  ""  "}" ""    # Latin text in braces {}

   0     # number of output SHIFT sequences, two sets of input characters

# conversion table
# Every rule has four columns:
# inp_set     inp_seq           out_set    out_seq
# inp_set is the input set the rule applies to (1 = Russian transliteration,
# 2 = Latin text inside braces); out_seq is what replaces a matched inp_seq.
# NOTE(review): rules with a negative out_set are described in this file as
# backstepping; the exact meaning of -1 versus -2 is not stated here --
# see the translit documentation.

# Latin(ASCII) is embraced in {}
# Letters and both quote characters of input set 2 are copied unchanged.
     2         ["'A-Za-z]           0  ["'A-Za-z]

# Cyrillic letters

# If already converted to KOI8 by backstepping, send it to output
# (any byte in the range 0x80-0xFF is copied through as is)
     1 {([\0x80-\0xFF])}    0         {\1}

# Hard sign (tvyordyj znak) handling for the double-quote character.
# Convert " followed by a capital letter to capital Tvyordyj znak and backstep
# otherwise " will be treated as a small tvyordyj znak
# The first two rules keep a literal quote when it touches a non-letter.
     1 {"([^A-Za-z])}  0         {"\1}             # " before a non-letter (at the end)
     1 {([^A-Za-z])"}  0         {\1"}             # " after a non-letter (at the beginning)
     1 {"([A-Z])}     -2         {\0xFF\1}         # capital Tvyordyj znak
     

# Alternative spellings of the hard sign, then the bare quote (0x22) itself.
     1 {Q[Hh]}         0         "\0xFF"           # Some use it as Tvyordyj
     1 "qh"            0         "\0xDF"           # Some use it as tvyordyj
     1 "\0x22"         0         "\0xDF"           # tvyordyj znak

# Soft sign (myagkij znak) handling for the apostrophe character.
# Convert ' preceded by a capital letter to capital Myagkij znak and backstep
# otherwise ' will be treated as a small myagkij znak

# Two apostrophes in a row, or an apostrophe after a non-letter, are kept as
# literal quote characters.
     1 "''"                 0         "''"              # double quote
     1 {([^A-Za-z])'}       0         {\1'}             # opening quote
# Myagkij znak
# Capital form: apostrophe after SHCH, SH, CH, TCH or any single capital
# letter; the preceding letters (group 1) are re-emitted before 0xF8.
 1 {((S[Hh][Cc][Hh])|(S[Hh])|(C[Hh])|(T[Cc][Hh])|([A-Z]))'} -1  {\1\0xF8}
                       
     1       "Q"            0         "\0xF8"           # Q as capital Myagkij znak

     1       "'"            0         "\0xD8"           # myagkij znak
     1       "q"            0         "\0xD8"           # q as small myagkij znak

 1 {(([YIJ]?[EOUA])|([J]?[EOAUY]))((Y)|([IJ]))([^A-Za-z])} -1  {\1\0xEA\7} #-J
#   12              3             45   6      7
# (the digits above mark where each capture group opens)
# Capital form: a vowel group (group 1) followed by Y, I or J and then a
# non-letter (group 7).  The Y/I/J is replaced by 0xEA (capital I kratkoje)
# while groups 1 and 7 are re-emitted around it.
# NOTE(review): out_set -1 is a backstep; presumably the replacement text is
# matched again -- confirm in the translit documentation.
 1 {(([yij]?[eoua])|([j]?[eoauy]))((y)|([ij]))([^A-Za-z])} -1  {\1\0xCA\7} #-j
#   12              3             45   6      7
# Lowercase twin of the rule above; emits 0xCA (small i kratkoje).

# the story of ts versus c (the ts for c was a stupid idea of Library of
# Congress --- very, very stupid... T and S should be T and S, not C).
#
# The rules below decide when TS (or C) stands for the single Cyrillic
# letter written 0xE3 (capital) / 0xC3 (small) in this table.
#
# INTS/INC emit I + N + C.  The middle byte must be the one the single-letter
# rules use for N/n (0xEE / 0xCE); 0xED / 0xCD are the bytes for M/m.
     1 "INTS"                       0       "\0xE9\0xEE\0xE3"  #INC
     1 "INC"                        0       "\0xE9\0xEE\0xE3"  #INC
     1 "ints"                       0       "\0xC9\0xCE\0xC3"  #inc
     1 "inc"                        0       "\0xC9\0xCE\0xC3"  #inc
     1 "CI"                         0       "\0xE3\0xE9"
     1 "ci"                         0       "\0xC3\0xC9"
# AVIATS/AVIAC emit A + V + I + A + C.  The second byte must be the one the
# single-letter rules use for V/v (0xF7 / 0xD7); 0xE2 / 0xC2 are B/b.
     1 {AVIA(TS|C)}                 0       "\0xE1\0xF7\0xE9\0xE1\0xE3" #AVIAC
     1 {avia(ts|c)}                 0       "\0xC1\0xD7\0xC9\0xC1\0xC3" #aviac
# The remaining rules are backstep rules (negative out_set): the character
# captured in group 1 is re-emitted next to the converted C.
# NOTE(review): the exact difference between -1 and -2 is not stated in this
# file -- see the translit documentation.
     1 {tsi([iyjo])}               -2       {\0xC3\0xC9\1}    # ci
     1 {TSI([IYJO])}               -2       {\0xE3\0xE9\1}    # CI
     1 {T[Ss]([Aa])}               -2       {\0xE3\1}         # CA
     1 {t[Ss]([Aa])}               -2       {\0xC3\1}         # ca
     1 {([DdKk])T[Ss]}             -1       {\1\0xE3}         # DC or KC
     1 {([DdKk])t[Ss]}             -1       {\1\0xC3}         # dc or kc
     1 {TS([^A-Za-z])}             -2       {\0xE3\1}         # C
     1 {ts([^A-Za-z])}             -2       {\0xC3\1}         # c

#    Je --- people frequently write e instead of Je. E oborotnoje is
#    frequently at the beginning of foreign origin words

# AER / aer: the E in the middle becomes E oborotnoje (0xFC capital, 0xDC
# small).  The lowercase rule re-emits the leading A or a (group 1).
     1 "AER"                                       0   "\0xE1\0xFC\0xF2"
     1 {([Aa])er}                                 -1   {\1\0xDC\0xD2}

# Letter + apostrophe + optional I/Y/J + E: emitted as the letter (group 1),
# a myagkij znak and Je.  The case of the E selects the capital (0xF8 0xE5)
# or small (0xD8 0xC5) pair.
     1 {([A-Za-z])'[IiYyJj]?E}                    -1   {\1\0xF8\0xE5} # Je
     1 {([A-Za-z])'[IiYyJj]?e}                    -1   {\1\0xD8\0xC5} # je

# Capital Je
# E directly after a non-letter (group 1) and followed by one of the listed
# letter combinations (group 2) is emitted as Je (0xE5); groups 1 and 2 are
# re-emitted around it.  The rule spans several physical lines.
 1 {([^A-Za-z])E(([Mm][Uu]?[^A-Za-z])|([Mm][Ll])|([Ll][^EeIiLlYyJj'])\
        |([Ll][YyIiJj]?[Ee][^A-Za-z])|([Rr][Uu])|([Ss][HhTtLl])|([Kk][Aa]))}
   
                                                  -1 {\1\0xE5\2}   # Je
# Small je
# Same pattern as the capital rule, for a lowercase e; emits 0xC5.
 1 {([^A-Za-z])e(([Mm][Uu]?[^A-Za-z])|([Mm][Ll])|([Ll][^EeIiLlYyJj'])\
        |([Ll][YyIiJj]?[Ee][^A-Za-z])|([Rr][Uu])|([Ss][HhTtLl])|([Kk][Aa]))}
                                                  -1 {\1\0xC5\2}   # je

# Capital Eh
# E directly after a non-letter and followed by LEK/LEG (any case) or by one
# of the listed consonants is emitted as E oborotnoje (0xFC).
 1 {([^A-Za-z])E(([Ll][Ee][KkGg])|([KLMNPRSTFklmnprstf]))} -1 {\1\0xFC\2} #Eh

# Small eh
# Lowercase twin of the rule above; emits 0xDC.
 1 {([^A-Za-z])e(([Ll][Ee][KkGg])|([KLMNPRSTFklmnprstf]))} -1 {\1\0xDC\2} #eh

     1 {([iIOoPpUuFfYy])i[Ee]}                    -1   {\1\0xC5}     # ie->je
     1 {([iIOoPpUuFfYy])I[Ee]}                    -1   {\1\0xE5}     # IE->Je
# In the two rules above the letter before the IE pair (group 1) is
# re-emitted and the pair itself collapses to a single je/Je byte.

#   Eh is e oborotnoje but not at the end of the word
# When EH is followed by one or more non-letters it is emitted as Je + Kha
# (two bytes) and the non-letters (group 1) are copied after it.
     1 {E[Hh]([^A-Za-z]+)}  0    {\0xE5\0xE8\1}
     1 {e[Hh]([^A-Za-z]+)}  0    {\0xC5\0xC8\1}
     1       {E[Hh]}        0         "\0xFC"           # E oborotnoje
     1       "eh"           0         "\0xDC"           # e oborotnoje

#   Various I kratkoje spellings: JI and JJ both collapse to one byte
#   (0xEA when the first letter is a capital J, 0xCA for lowercase).
     1       {J[Ii]}        0         "\0xEA"           # I kratkoje
     1       {J[Jj]}        0         "\0xEA"           # I kratkoje
     1       "ji"           0         "\0xCA"           # i kratkoje
     1       "jj"           0         "\0xCA"           # i kratkoje


# SHCH
# The four-letter sequence and the one-letter shorthand W map to the same
# byte: 0xDD when the first letter is lowercase, 0xFD when it is capital.
     1 {s[Hh][Cc][Hh]}      0         "\0xDD"           # shch
     1       "w"            0         "\0xDD"           # w as shch
     1 {S[Hh][Cc][Hh]}      0         "\0xFD"           # SHCH
     1       "W"            0         "\0xFD"           # W as SHCH

                       
     1       {[YJ][Oo]}     0         "\0xB3"           # capital Jo
     1       {J[Ee]}        0         "\0xE5"           # Je
# Exceptions where I + U / I + A must stay two separate letters come before
# the general Ju / Ja rules.
# The RIUM pattern has a single capture group (the M plus the following
# non-letter), so the replacement must re-emit group 1; a reference to a
# second group would point at nothing.
     1 {RIU(M[^A-Za-z])}   -1       {\0xF2\0xE9\0xF5\1} # IU
     1 {([^A-Za-z])I([Uu][Dd])} -1    {\1\0xE9\2}
     1 "DIUS"                    0    "\0xE4\0xE9\0xF5\0xF3"
     1       {[IYJ][Uu]}    0         "\0xE0"           # Ju
     1 {([Dd])I([Aa][KkGgPp])} -1     {\1\0xE9\2}       # dia
     1 "RIAL"               0         "\0xF2\0xE9\0xE1\0xEC"       # rial
     1 "KIA"                0         "\0xEB\0xE9\0xE1" # kia
     1       {[IYJ][Aa]}    0         "\0xF1"           # Ja
# Two-letter consonant spellings starting with a capital letter.
     1       {Z[Hh]}        0         "\0xF6"           # ZH
     1       {K[Hh]}        0         "\0xE8"           # KH
     1       {H[Hh]}        0         "\0xE8"           # HH, same byte as KH
     1       {C[Hh]}        0         "\0xFE"           # CH
     1       {S[Hh]}        0         "\0xFB"           # SH
     1       "zh"           0         "\0xD6"           # zh
# Lowercase two-letter consonant spellings continue below.
     1       "kh"           0         "\0xC8"           # kh
     1       "hh"           0         "\0xC8"           # hh, same byte as kh
     1       "ch"           0         "\0xDE"           # ch
     1       "sh"           0         "\0xDB"           # sh
     1       {[yj]o}        0         "\0xA3"    #jo
     1       "je"           0         "\0xC5"    #je

# Lowercase exceptions where i + u / i + a stay two separate letters; they
# are listed before the general ju / ja rules.  In the backstep rules the
# captured neighbours (groups 1 and 2) are re-emitted around the converted i.
     1 {([Rr])iu([Mm][^A-Za-z])} -1   {\1\0xC9\0xD5\2} # iu
     1 {([^A-Za-z])i(ud)}        -1   {\1\0xC9\2}
     1 "dius"                     0   "\0xC4\0xC9\0xD5\0xD3"
     1 {[iyj]u}                   0   "\0xC0"    #ju
     1 {([Dd])ia([kgp])}         -1   {\1\0xC9\0xC1\2}  # dia
     1 "rial"                     0   "\0xD2\0xC9\0xC1\0xCC"       # rial
     1 "kia"                      0   "\0xCB\0xC9\0xC1"            # kia
     1 {[iyj]a}                   0   "\0xD1"    #ja

     1       "A"            0         "\0xE1"           # single capital letters start here
# Each row below maps one capital Latin letter of input set 1 to a single
# output byte in set 0.
# NOTE(review): these rows follow all multi-letter rules; presumably earlier
# rows win when both could match -- confirm in the translit documentation.
     1       "B"            0         "\0xE2"
     1       "V"            0         "\0xF7"
     1       "G"            0         "\0xE7"
     1       "D"            0         "\0xE4"
     1       "Z"            0         "\0xFA"
     1       "I"            0         "\0xE9"
     1       "J"            0         "\0xEA"           # I kratkoje
     1       "K"            0         "\0xEB"
     1       "L"            0         "\0xEC"
     1       "M"            0         "\0xED"
     1       "N"            0         "\0xEE"
     1       "O"            0         "\0xEF"
     1       "P"            0         "\0xF0"
     1       "R"            0         "\0xF2"
     1       "S"            0         "\0xF3"
     1       "T"            0         "\0xF4"
     1       "U"            0         "\0xF5"
     1       "F"            0         "\0xE6"
# X and H are two spellings of the same letter and share one output byte.
     1       "X"            0         "\0xE8"          # Kha
     1       "H"            0         "\0xE8"          # Kha
     1       "C"            0         "\0xE3"
     1       "Y"            0         "\0xF9"
     1       "E"            0         "\0xE5"          #Je
     1       "a"            0         "\0xC1"           # single lowercase letters start here
# Each row below maps one lowercase Latin letter of input set 1 to a single
# output byte in set 0; every byte here is 0x20 lower than the byte used for
# the same letter in the capital rows.
     1       "b"            0         "\0xC2"
     1       "v"            0         "\0xD7"
     1       "g"            0         "\0xC7"
     1       "d"            0         "\0xC4"
     1       "z"            0         "\0xDA"
     1       "i"            0         "\0xC9"
     1       "j"            0         "\0xCA"           # i kratkoje
     1       "k"            0         "\0xCB"
     1       "l"            0         "\0xCC"
     1       "m"            0         "\0xCD"
     1       "n"            0         "\0xCE"
     1       "o"            0         "\0xCF"
     1       "p"            0         "\0xD0"
     1       "r"            0         "\0xD2"
     1       "s"            0         "\0xD3"
     1       "t"            0         "\0xD4"
     1       "u"            0         "\0xD5"
     1       "f"            0         "\0xC6"
# x and h are two spellings of the same letter and share one output byte.
     1       "x"            0         "\0xC8"      # kha
     1       "h"            0         "\0xC8"      # kha
     1       "c"            0         "\0xC3"
     1       "y"            0         "\0xD9"
     1       "e"            0         "\0xC5"      # je