When I searched for this, I found only a few examples, and they were all old (written for Python 2), so I wrote my own version. (Updated since first posting to fix a bug.)
python
import re
# Translation table: kanji digits -> ASCII digits. Covers the ordinary digits
# 一..九, the zero 〇, and the formal variants 壱/弐/参 (1/2/3). Both arguments
# to str.maketrans must have the same length (13 characters here).
tt_ksuji = str.maketrans('一二三四五六七八九〇壱弐参', '1234567890123')

# One candidate number: a maximal run of unit characters and/or digits.
re_suji = re.compile(r'[十拾百千万億兆\d]+')
# Tokenizer for values below 10,000: a single small unit or a run of digits.
re_kunit = re.compile(r'[十拾百千]|\d+')
# Tokenizer for the 10,000-based groups: a single large unit, or everything
# between large units (which is then parsed with re_kunit).
re_manshin = re.compile(r'[万億兆]|[^万億兆]+')

# Multipliers for the small units (拾 is the formal variant of 十).
TRANSUNIT = {
    '十': 10,
    '拾': 10,
    '百': 100,
    '千': 1000,
}
# Multipliers for the large (power-of-10,000) units.
TRANSMANS = {
    '万': 10000,
    '億': 100000000,
    '兆': 1000000000000,
}
def kansuji2arabic(kstring: str, sep: bool = False) -> str:
    """Convert kanji numerals embedded in *kstring* to Arabic numerals.

    Kanji digits are first mapped to ASCII digits with ``tt_ksuji``. Every
    run matched by ``re_suji`` (digits and/or unit characters) that contains
    at least one unit character is then evaluated and replaced by its
    decimal value.

    Args:
        kstring: Text that may contain kanji and/or Arabic numerals.
        sep: When true, converted values are written with thousands
            separators (``1,035,001,164``), and plain digit runs longer than
            three characters are reformatted the same way.

    Returns:
        The text with the numerals replaced. Text outside the matched runs
        is returned as produced by the digit translation.

    Note:
        Matching is purely character based, so a unit character inside an
        ordinary word is converted as well.
    """

    def _transvalue(sj, re_obj=re_kunit, transdic=TRANSUNIT):
        """Evaluate one numeral string using the given tokenizer and units."""
        unit = 1
        result = 0
        # Scan right to left so each unit is seen before the number that
        # multiplies it.
        for piece in reversed(re_obj.findall(sj)):
            if piece in transdic:
                # The previous unit had no explicit multiplier in front of
                # it, so it counts once.
                if unit > 1:
                    result += unit
                unit = transdic[piece]
            else:
                # A non-decimal piece is a sub-10,000 group that still
                # contains small units; evaluate it with the defaults.
                val = int(piece) if piece.isdecimal() else _transvalue(piece)
                result += val * unit
                unit = 1
        # A leading unit with nothing before it also counts once.
        if unit > 1:
            result += unit
        return result

    def _to_arabic(match):
        """Return the replacement text for one matched numeral run."""
        suji = match.group()
        if not suji.isdecimal():
            value = _transvalue(suji, re_manshin, TRANSMANS)
            return '{:,}'.format(value) if sep else str(value)
        if sep and len(suji) > 3:
            return '{:,}'.format(int(suji))
        return suji

    # A single substitution pass converts each maximal run exactly once, so
    # no replacement can touch the output of another.
    return re_suji.sub(_to_arabic, kstring.translate(tt_ksuji))
In fact, any Chinese or Arabic numeral is now normalized to a half-width Arabic numeral.
kansuji2arabic('平成二十七年度の予算額は10億3千5百万1164円', True)
'平成27年度の予算額は1,035,001,164円'
Recommended Posts