181 lines
4.1 KiB
Java
181 lines
4.1 KiB
Java
# Old Hungarian script (ISO 15924 code: Hung)
|
||
|
||
# Transliterate numbers and words
|
||
|
||
# convert words with traditional or foreign "i" written as "y"
|
||
# e.g. Áprily, Champs-Élysées, Élysée-palota, Dolly, Folly, Hollywood, jolly...
|
||
# Both rules expect input of the form "<word> 0". The listed stem (group 1) and
# the text after the "y" (group 2) are transliterated by the recursive calls
# $1 and $2; the "y"/"Y" itself is emitted as the glyph that the plain "i"/"I"
# letter rules produce. The first rule covers lower/capitalised spellings, the
# second the all-capital spellings.
# NOTE(review): as written, every "?" makes the preceding letter optional
# (e.g. "Áp?ri?l" also matches "Árl"). Presumably an invisible optional
# character (such as a soft hyphen) stood before each "?" and was lost --
# confirm against the upstream file.
# NOTE(review): the two lists are not exact case variants of each other
# ("[pP]enn" vs "PEN?N", "Re?gu?l" vs "REGU?L", "So?n" vs "SON") -- confirm
# whether that is intended.
"^(Áp?ri?l|Champs-Él|[cC]i?t|Do?lák-Sa?l|[dfhjDFHJ]ol?l|Él|Fesz?t|[gG]rizz?l|Ha?rasz?t|Hat?va?n|Husz?t|[iI]n?ter?ci?t|Kéth?l|Ku?ko?r?el?l|Mind?szen?t|Nosz?t|[pP]enn|Pes?t|Re?gu?l|So?n|Szi?l|Szte?va?no?vi?t|Thö?kö?l|Vö?rös?mar?t|[zZ][lł]ot)y(.*) 0$" $1𐳐$2
|
||
"^(ÁP?RI?L|CHAMPS-ÉL|CI?T|DO?LÁK-SA?L|[DFHJ]OL?L|ÉL|FESZ?T|GRIZZ?L|HA?RASZ?T|HAT?VA?N|HUSZ?T|IN?TER?CI?T|KÉTH?L|KU?KO?R?EL?L|MIND?SZEN?T|NOSZ?T|PEN?N|PES?T|REGU?L|SON|SZI?L|SZTE?VA?NO?VI?T|THÖ?KÖ?L|VÖ?RÖS?MAR?T|Z[LŁ]OT)Y(.*) 0$" $1𐲐$2
|
||
# if the original word contains an unknown character, return without modification
|
||
# Input has the form "<word> 0"; on a match the word is returned as-is (\1 is a
# plain back-reference, not a recursive call).
# í/Í belong to the known set: they have their own letter rules, so a word
# ending in í/Í must not be passed through untransliterated.
# NOTE(review): the pattern only tests the last character before " 0", not
# every character of the word -- confirm that this is intended.
"^(.*[^-0-9a-zA-ZáéëíóöőúüűÁÉËÍÓÖŐÚÜŰ–,„”\?\;]) 0$" \1
|
||
# words with y
|
||
# All rules of this group expect input of the form "<word> 0"; captured parts
# referenced as $1/$2 are transliterated by recursive calls.
# "y" before "ard": emitted as the glyph of the "j" letter rule.
"^y(ard.*) 0$" 𐳒$1
|
||
# "Y" before "ard" or "ucon" (any letter case): glyph of the "J" letter rule.
"^Y([aA][rR][dD].*|[uU][cC][oO][nN].*) 0$" 𐲒$1
|
||
# "Y" before "bl" (any letter case): glyph of the "Í" letter rule.
"^Y([bB][lL].*) 0$" 𐲑$1
|
||
# "Y" before "vet?te": glyph of the "I" letter rule.
# NOTE(review): "t?" makes the first "t" optional as written; an invisible
# optional character may have been lost before the "?" -- confirm.
"^Y(vet?te.*) 0$" 𐲐$1
|
||
# Listed stems followed by "y"/"Y": the y is emitted as the "j"/"J" glyph.
"^([bB]o|[cC]owbo|[dD]ispla|[gG]ra|[pP]la)y(.*) 0$" $1𐳒$2
|
||
"^(BO|COWBO|DISPLA|GRA|PLA)Y(.*) 0$" $1𐲒$2
|
||
# don't transliterate other words with starting y
|
||
# (\1 returns the word unchanged, without the trailing " 0")
"(^[yY].*) 0$" \1
|
||
# don't transliterate words with q, but not with qu
|
||
# "[^uU]" needs a character after the q, so a word that ends in "q" is not
# caught by this rule.
"(^.*[qQ][^uU].*) 0$" \1
|
||
|
||
# avoid exceeding the recursion depth:
|
||
# convert by 200-character parts
|
||
# The first 200 characters and the non-empty remainder are converted by
# separate recursive calls ($1, $2) and the results are concatenated.
(.{200})(.+) $1$2
|
||
|
||
# numbers
|
||
|
||
# remove space separated zero (in LibreOffice integration)
|
||
"(\d+) 0" $1
|
||
|
||
# Helper rules, called as "<digit>: <unit sign> <five-unit sign>".
# Digits 0-4 yield that many unit signs (nothing for 0); digits 5-9 yield the
# five-unit sign followed by (digit - 5) unit signs.
"0: (.*) (.*)"
|
||
"1: (.*) (.*)" \1
|
||
"2: (.*) (.*)" \1\1
|
||
"3: (.*) (.*)" \1\1\1
|
||
"4: (.*) (.*)" \1\1\1\1
|
||
"5: (.*) (.*)" \2
|
||
"6: (.*) (.*)" \2\1
|
||
"7: (.*) (.*)" \2\1\1
|
||
"8: (.*) (.*)" \2\1\1\1
|
||
"9: (.*) (.*)" \2\1\1\1\1
|
||
|
||
# Zero groups contribute nothing, so they are dropped before a multiplier sign
# can be attached to them. Without these two rules "000" still produced the
# hundred sign, which made e.g. 2000 and 2100 come out identical.
0(\d\d) $1
000(\d{3,}) $1
# ones: unit sign and five sign
(\d) $(\1: 𐳺 𐳻)
|
||
# tens: ten sign and fifty sign, followed by the ones
(\d)(\d) $(\1: 𐳼 𐳽)$2
|
||
# Hundreds. With a leading 1 the multiplier is omitted, but the hundred sign
# still has to precede the remainder, exactly as in the general rule below.
# The former order (remainder first) made 105 and 500 come out identical.
1(\d\d) 𐳾$1
|
||
(\d)(\d\d) $1𐳾$2
|
||
# Thousands, 10^6 and 10^9 use one, two and three thousand signs. The rules
# with a leading 1 apply only at the end of the input ("$") and omit the
# multiplier; the sign precedes the remainder for the same reason as above
# (previously 1234 and 234000 came out identical).
1(\d\d\d)$ 𐳿$1
|
||
(\d{1,3})(\d\d\d) $1𐳿$2
|
||
1(\d{6})$ 𐳿𐳿$1
|
||
(\d{1,3})(\d{6}) $1𐳿𐳿$2
|
||
1(\d{9})$ 𐳿𐳿𐳿$1
|
||
(\d{1,3})(\d{9}) $1𐳿𐳿𐳿$2
|
||
|
||
# numbers with letters, for example dates with affixes
|
||
|
||
# The leading run of digits and the attached run of non-space characters are
# converted by separate recursive calls ($1, $2) and concatenated.
"(\d+)([^ ]+)" $1$2
|
||
|
||
# letters
|
||
|
||
# Strip the trailing " 0" marker and transliterate the bare word.
"^(.*) 0$" $1
|
||
# Each letter rule consumes one letter (or one multi-letter consonant) from the
# front of the text, emits its glyph and converts the remainder through the
# recursive call $1. Because the first matching rule wins, longer patterns are
# listed before their prefixes (ccs before cs before c).
a(.*) 𐳀$1
|
||
A(.*) 𐲀$1
|
||
á(.*) 𐳁$1
|
||
Á(.*) 𐲁$1
|
||
b(.*) 𐳂$1
|
||
B(.*) 𐲂$1
|
||
# doubled "cs" written as "ccs": the cs glyph is emitted twice
ccs(.*) 𐳆𐳆$1
|
||
CCS(.*) 𐲆𐲆$1
|
||
cs(.*) 𐳆$1
|
||
# matches both "Cs" and "CS"
C[sS](.*) 𐲆$1
|
||
c(.*) 𐳄$1
|
||
C(.*) 𐲄$1
|
||
d(.*) 𐳇$1
|
||
D(.*) 𐲇$1
|
||
e(.*) 𐳉$1
|
||
E(.*) 𐲉$1
|
||
é(.*) 𐳋$1
|
||
É(.*) 𐲋$1
|
||
# ä/Ä emit the same glyphs as é/É
ä(.*) 𐳋$1
|
||
Ä(.*) 𐲋$1
|
||
ë(.*) 𐳊$1
|
||
Ë(.*) 𐲊$1
|
||
f(.*) 𐳌$1
|
||
F(.*) 𐲌$1
|
||
# g, gy and the doubled form ggy (gy glyph emitted twice); the longer patterns
# come first because the first matching rule wins
ggy(.*) 𐳎𐳎$1
|
||
GGY(.*) 𐲎𐲎$1
|
||
gy(.*) 𐳎$1
|
||
# matches both "Gy" and "GY"
G[yY](.*) 𐲎$1
|
||
g(.*) 𐳍$1
|
||
G(.*) 𐲍$1
|
||
h(.*) 𐳏$1
|
||
H(.*) 𐲏$1
|
||
i(.*) 𐳐$1
|
||
I(.*) 𐲐$1
|
||
í(.*) 𐳑$1
|
||
Í(.*) 𐲑$1
|
||
j(.*) 𐳒$1
|
||
J(.*) 𐲒$1
|
||
k(.*) 𐳓$1
|
||
K(.*) 𐲓$1
|
||
# l, ly and the doubled form lly (ly glyph emitted twice)
lly(.*) 𐳗𐳗$1
|
||
LLY(.*) 𐲗𐲗$1
|
||
ly(.*) 𐳗$1
|
||
# matches both "Ly" and "LY"
L[yY](.*) 𐲗$1
|
||
l(.*) 𐳖$1
|
||
L(.*) 𐲖$1
|
||
m(.*) 𐳘$1
|
||
M(.*) 𐲘$1
|
||
# n, ny and the doubled form nny (ny glyph emitted twice); the longer patterns
# come first because the first matching rule wins
nny(.*) 𐳚𐳚$1
|
||
NNY(.*) 𐲚𐲚$1
|
||
ny(.*) 𐳚$1
|
||
# Capital digraph: matches "Ny" and "NY", like the G[yY], L[yY] and T[yY]
# rules. The former class [ny] missed "NY" and wrongly consumed "Nn".
N[yY](.*) 𐲚$1
|
||
n(.*) 𐳙$1
|
||
N(.*) 𐲙$1
|
||
# Vowels o, ó, ö, ő and the consonant p: one glyph per letter, the remainder is
# converted through the recursive call $1.
o(.*) 𐳛$1
|
||
O(.*) 𐲛$1
|
||
ó(.*) 𐳜$1
|
||
Ó(.*) 𐲜$1
|
||
ö(.*) 𐳞$1
|
||
Ö(.*) 𐲞$1
|
||
ő(.*) 𐳟$1
|
||
Ő(.*) 𐲟$1
|
||
p(.*) 𐳠$1
|
||
P(.*) 𐲠$1
|
||
# "qu" is written with the glyphs of k and v; the case of each glyph follows
# the case of the corresponding Latin letter.
qu(.*) 𐳓𐳮$1 # qu->kv
|
||
Qu(.*) 𐲓𐳮$1 # Qu->Kv
|
||
QU(.*) 𐲓𐲮$1 # QU->KV
|
||
r(.*) 𐳢$1
|
||
R(.*) 𐲢$1
|
||
# s, sz and the doubled form ssz (sz glyph emitted twice); the longer patterns
# come first because the first matching rule wins
ssz(.*) 𐳥𐳥$1
|
||
SSZ(.*) 𐲥𐲥$1
|
||
sz(.*) 𐳥$1
|
||
# matches both "Sz" and "SZ"
S[zZ](.*) 𐲥$1
|
||
# "sch"/"Sch" emit the same glyph as plain "s"/"S"
sch(.*) 𐳤$1
|
||
Sch(.*) 𐲤$1
|
||
s(.*) 𐳤$1
|
||
S(.*) 𐲤$1
|
||
# t, ty and the doubled form tty (ty glyph emitted twice)
tty(.*) 𐳨𐳨$1
|
||
TTY(.*) 𐲨𐲨$1
|
||
ty(.*) 𐳨$1
|
||
# matches both "Ty" and "TY"
T[yY](.*) 𐲨$1
|
||
t(.*) 𐳦$1
|
||
T(.*) 𐲦$1
|
||
u(.*) 𐳪$1
|
||
U(.*) 𐲪$1
|
||
ú(.*) 𐳫$1
|
||
Ú(.*) 𐲫$1
|
||
ü(.*) 𐳭$1
|
||
Ü(.*) 𐲭$1
|
||
ű(.*) 𐳬$1
|
||
Ű(.*) 𐲬$1
|
||
# v and w share one glyph
[vw](.*) 𐳮$1
|
||
[VW](.*) 𐲮$1
|
||
# "x" is written with the glyphs of k and sz.
x(.*) 𐳓𐳥$1 # x->ksz
|
||
# The next two rules match a lone "X" (nothing follows it). Their patterns
# have no capture group, so the replacement must not reference $1.
# "^" restricts the first of them to the beginning of the input.
^X 𐲓𐳥 # X->KSz
|
||
X 𐲓𐲥 # X->KSZ
|
||
# "X" followed by a capital letter: both glyphs are capital.
X([A-ZÁÉËÍÓÖŐÚÜŰ].*) 𐲓𐲥$1 # X->KSZ
|
||
X(.*) 𐲓𐳥$1 # X->Ksz
|
||
# Any "y"/"Y" not consumed by an earlier rule is emitted as the glyph of the
# "i"/"I" letter rules.
y(.*) 𐳐$1 # .+y->i
|
||
Y(.*) 𐲐$1 # .+Y->I
|
||
# z, zs and the doubled form zzs (zs glyph emitted twice); the longer patterns
# come first because the first matching rule wins
zzs(.*) 𐳰𐳰$1
|
||
ZZS(.*) 𐲰𐲰$1
|
||
zs(.*) 𐳰$1
|
||
# matches both "Zs" and "ZS"
Z[sS](.*) 𐲰$1
|
||
z(.*) 𐳯$1
|
||
Z(.*) 𐲯$1
|
||
|
||
# remove ZWSP (used for consonant disambiguation)
|
||
# The zero-width space is written as the escape \u200B so that it stays visible
# and the rule matches only text that really starts with U+200B. A bare
# "(.*) $1" matches every string and calls itself with unchanged input: it
# never terminates and hides all rules below it.
\u200B(.*) $1
|
||
|
||
# punctuation
|
||
# Each rule replaces one leading punctuation mark by its reversed counterpart
# and converts the remainder through the recursive call $1:
#   ” -> ‟   ; -> ⁏   ? -> ⸮   , -> ⹁   „ -> ⹂
# ";" and "?" are backslash-escaped in the patterns.
”(.*) ‟$1
|
||
\;(.*) ⁏$1
|
||
\?(.*) ⸮$1
|
||
,(.*) ⹁$1
|
||
„(.*) ⹂$1
|
||
|
||
# don't modify unknown characters
|
||
# Fallbacks, reached only when no earlier rule matched. The first character is
# copied unchanged (\1) and the remainder is converted through the recursive
# call $2.
(.)(.*) \1$2
|
||
# Final catch-all: returns the text unchanged; it also matches the empty string
# and therefore ends the recursion.
(.*) \1
|