mirror of
https://github.com/fergalmoran/picard.git
synced 2025-12-25 19:04:09 +00:00
Improve unicode to ascii ...
for punctuation, normalization (merging similar looking characters into the most common one), accents, and full conversion to ascii. Functions and tests moved into separate files. Although more comprehensive, the code should run faster because it eliminates several loops (inc. a loop with two unicodedata references). This is intended to form the basis of future PRs to: a. Clean up (simplify) the file naming code b. Provide script function(s) for cleaning individual tags / file name parts c. Add support for translation / transliteration plugins (which I think make more sense than being included in Picard itself). d. Support for converting Tags to ISO-8859-1 rather than ascii (since that is what is supported by ID3 at least) e. Possible additional options for allowing / preventing normalization, possible reorganisation of options to centralise all encoding settings onto one page rather than metadata, tags and file naming pages at present (to be discussed).
This commit is contained in:
233
test/test_textencoding.py
Normal file
233
test/test_textencoding.py
Normal file
@@ -0,0 +1,233 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os.path
|
||||
import unittest
|
||||
from picard import util
|
||||
#from picard.util import textencoding
|
||||
|
||||
# Set the value to true below to show the coverage of Latin characters
|
||||
show_latin2ascii_coverage = False
|
||||
|
||||
compatibility_from = (
|
||||
u"\u0132\u0133\u017F\u01C7\u01C8\u01C9\u01CA\u01CB\u01CC\u01F1" # IJijſLJLjljNJNjnjDZ
|
||||
u"\u01F2\u01F3\uFB00\uFB01\uFB02\uFB03\uFB04\uFB05\uFB06\uFF21" # DzdzfffiflffifflſtstA
|
||||
u"\uFF22\uFF23\uFF24\uFF25\uFF26\uFF27\uFF28\uFF29\uFF2A\uFF2B" # BCDEFGHIJK
|
||||
u"\uFF2C\uFF2D\uFF2E\uFF2F\uFF30\uFF31\uFF32\uFF33\uFF34\uFF35" # LMNOPQRSTU
|
||||
u"\uFF36\uFF37\uFF38\uFF39\uFF3A\uFF41\uFF42\uFF43\uFF44\uFF45" # VWXYZabcde
|
||||
u"\uFF46\uFF47\uFF48\uFF49\uFF4A\uFF4B\uFF4C\uFF4D\uFF4E\uFF4F" # fghijklmno
|
||||
u"\uFF50\uFF51\uFF52\uFF53\uFF54\uFF55\uFF56\uFF57\uFF58\uFF59" # pqrstuvwxy
|
||||
u"\uFF5A\u2100\u2101\u2102\u2105\u2106\u210A\u210B\u210C\u210D" # z℀℁ℂ℅℆ℊℋℌℍ
|
||||
u"\u210E\u2110\u2111\u2112\u2113\u2115\u2116\u2119\u211A\u211B" # ℎℐℑℒℓℕ№ℙℚℛ
|
||||
u"\u211C\u211D\u2121\u2124\u2128\u212C\u212D\u212F\u2130\u2131" # ℜℝ℡ℤℨℬℭℯℰℱ
|
||||
u"\u2133\u2134\u2139\u213B\u2145\u2146\u2147\u2148\u2149\u3371" # ℳℴℹ℻ⅅⅆⅇⅈⅉ㍱
|
||||
u"\u3372\u3373\u3374\u3375\u3376\u3377\u337A\u3380\u3381\u3383" # ㍲㍳㍴㍵㍶㍷㍺㎀㎁㎃
|
||||
u"\u3384\u3385\u3386\u3387\u3388\u3389\u338A\u338B\u338E\u338F" # ㎄㎅㎆㎇㎈㎉㎊㎋㎎㎏
|
||||
u"\u3390\u3391\u3392\u3393\u3394\u3399\u339A\u339C\u339D\u339E" # ㎐㎑㎒㎓㎔㎙㎚㎜㎝㎞
|
||||
u"\u33A9\u33AA\u33AB\u33AC\u33AD\u33B0\u33B1\u33B3\u33B4\u33B5" # ㎩㎪㎫㎬㎭㎰㎱㎳㎴㎵
|
||||
u"\u33B7\u33B8\u33B9\u33BA\u33BB\u33BD\u33BE\u33BF\u33C2\u33C3" # ㎷㎸㎹㎺㎻㎽㎾㎿㏂㏃
|
||||
u"\u33C4\u33C5\u33C7\u33C8\u33C9\u33CA\u33CB\u33CC\u33CD\u33CE" # ㏄㏅㏇㏈㏉㏊㏋㏌㏍㏎
|
||||
u"\u33CF\u33D0\u33D1\u33D2\u33D3\u33D4\u33D5\u33D6\u33D7\u33D8" # ㏏㏐㏑㏒㏓㏔㏕㏖㏗㏘
|
||||
u"\u33D9\u33DA\u33DB\u33DC\u33DD\u249C\u249D\u249E\u249F\u24A0" # ㏙㏚㏛㏜㏝⒜⒝⒞⒟⒠
|
||||
u"\u24A1\u24A2\u24A3\u24A4\u24A5\u24A6\u24A7\u24A8\u24A9\u24AA" # ⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪
|
||||
u"\u24AB\u24AC\u24AD\u24AE\u24AF\u24B0\u24B1\u24B2\u24B3\u24B4" # ⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴
|
||||
u"\u24B5\u2160\u2161\u2162\u2163\u2164\u2165\u2166\u2167\u2168" # ⒵ⅠⅡⅢⅣⅤⅥⅦⅧⅨ
|
||||
u"\u2169\u216A\u216B\u216C\u216D\u216E\u216F\u2170\u2171\u2172" # ⅩⅪⅫⅬⅭⅮⅯⅰⅱⅲ
|
||||
u"\u2173\u2174\u2175\u2176\u2177\u2178\u2179\u217A\u217B\u217C" # ⅳⅴⅵⅶⅷⅸⅹⅺⅻⅼ
|
||||
u"\u217D\u217E\u217F\u2474\u2475\u2476\u2477\u2478\u2479\u247A" # ⅽⅾⅿ⑴⑵⑶⑷⑸⑹⑺
|
||||
u"\u247B\u247C\u247D\u247E\u247F\u2480\u2481\u2482\u2483\u2484" # ⑻⑼⑽⑾⑿⒀⒁⒂⒃⒄
|
||||
u"\u2485\u2486\u2487\u2488\u2489\u248A\u248B\u248C\u248D\u248E" # ⒅⒆⒇⒈⒉⒊⒋⒌⒍⒎
|
||||
u"\u248F\u2490\u2491\u2492\u2493\u2494\u2495\u2496\u2497\u2498" # ⒏⒐⒑⒒⒓⒔⒕⒖⒗⒘
|
||||
u"\u2499\u249A\u249B\uFF10\uFF11\uFF12\uFF13\uFF14\uFF15\uFF16" # ⒙⒚⒛0123456
|
||||
u"\uFF17\uFF18\uFF19\u2002\u2003\u2004\u2005\u2006\u2007\u2008" # 789\u2002\u2003\u2004\u2005\u2006\u2007\u2008
|
||||
u"\u2009\u200A\u205F\uFF02\uFF07\uFE63\uFF0D\u2024\u2025\u2026" # \u2009\u200A\u205F"'﹣-․‥…
|
||||
u"\u203C\u2047\u2048\u2049\uFE10\uFE13\uFE14\uFE15\uFE16\uFE19" # ‼⁇⁈⁉︐︓︔︕︖︙
|
||||
u"\uFE30\uFE35\uFE36\uFE37\uFE38\uFE47\uFE48\uFE50\uFE52\uFE54" # ︰︵︶︷︸﹇﹈﹐﹒﹔
|
||||
u"\uFE55\uFE56\uFE57\uFE59\uFE5A\uFE5B\uFE5C\uFE5F\uFE60\uFE61" # ﹕﹖﹗﹙﹚﹛﹜﹟﹠﹡
|
||||
u"\uFE62\uFE64\uFE65\uFE66\uFE68\uFE69\uFE6A\uFE6B\uFF01\uFF03" # ﹢﹤﹥﹦﹨﹩﹪﹫!#
|
||||
u"\uFF04\uFF05\uFF06\uFF08\uFF09\uFF0A\uFF0B\uFF0C\uFF0E\uFF0F" # $%&()*+,./
|
||||
u"\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF1F\uFF20\uFF3B\uFF3C\uFF3D" # :;<=>?@[\]
|
||||
u"\uFF3E\uFF3F\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\u2A74\u2A75\u2A76" # ^_`{|}~⩴⩵⩶
|
||||
)
|
||||
compatibility_to = (
|
||||
u"IJijsLJLjljNJNjnjDZDzdzfffiflffifflststABCDEFGHIJKLMNOPQRSTU"
|
||||
u"VWXYZabcdefghijklmnopqrstuvwxyza/ca/sCc/oc/ugHHH"
|
||||
u"hIILlNNoPQRRRTELZZBCeEFMoiFAXDdeijhPadaAUbaroVpcdmIUpAnAmA"
|
||||
u"kAKBMBGBcalkcalpFnFmgkgHzkHzMHzGHzTHzfmnmmmcmkmPakPaMPaGParadpsnsmspVnVmVkVMVpWnWmWkWMWa.m.Bq"
|
||||
u"cccdCo.dBGyhaHPinKKKMktlmlnloglxmbmilmolPHp.m.PPMPRsrSvWb(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)"
|
||||
u"(p)(q)(r)(s)(t)(u)(v)(w)(x)(y)(z)IIIIIIIVVVIVIIVIIIIXXXIXIILCDMiiiiiiivvviviiviiiixxxixiil"
|
||||
u"cdm(1)(2)(3)(4)(5)(6)(7)(8)(9)(10)(11)(12)(13)(14)(15)(16)(17)(18)(19)(20)1.2.3.4.5.6.7.8.9.10.11.12.13.14.15.16.17."
|
||||
u"18.19.20.0123456789 \"'--......!!???!!?,:;!?..."
|
||||
u"..(){}[],.;:?!(){}#&*+<>=\\$%@!#$%&()*+,./"
|
||||
u":;<=>?@[\\]^_`{|}~::======"
|
||||
)
|
||||
compatibility_from += (
|
||||
u"\u1D00\u1D04\u1D05\u1D07\u1D0A\u1D0B\u1D0D\u1D0F\u1D18\u1D1B" # ᴀᴄᴅᴇᴊᴋᴍᴏᴘᴛ
|
||||
u"\u1D1C\u1D20\u1D21\u1D22\u3007\u00A0\u3000" # ᴜᴠᴡᴢ〇\u00A0\u3000
|
||||
)
|
||||
compatibility_to += u"ACDEJKMOPTUVWZ0 "
|
||||
punctuation_from = (
|
||||
u"\u2018\u2019\u201A\u201B\u201C\u201D\u201E\u201F\u2032\u301D" # ‘’‚‛“”„‟′〝
|
||||
u"\u301E\u00AB\u00BB\u2039\u203A\u00AD\u2010\u2012\u2013\u2014" # 〞«»‹›\u00AD‐‒–—
|
||||
u"\u2015\u2016\u2044\u2045\u2046\u204E\u3001\u3002\u3008\u3009" # ―‖⁄⁅⁆⁎、。〈〉
|
||||
u"\u300A\u300B\u3014\u3015\u3018\u3019\u301A\u301B\u00D7\u00F7" # 《》〔〕〘〙〚〛×÷
|
||||
u"\u2212\u2215\u2216\u2223\u2225\u226A\u226B\u2985\u2986\u00B7" # −∕∖∣∥≪≫⦅⦆·
|
||||
)
|
||||
punctuation_to = u"'','\"\",,\"'\"\"<<>><>------||/[]*,.<><<>>[][][]*/-/\\|||<<>>(())."
|
||||
combinations_from = (
|
||||
u"\u00C6\u00D0\u00D8\u00DE\u00DF\u00E6\u00F0\u00F8\u00FE\u0110" # ÆÐØÞßæðøþĐ
|
||||
u"\u0111\u0126\u0127\u0131\u0138\u0141\u0142\u014A\u014B\u0152" # đĦħıĸŁłŊŋŒ
|
||||
u"\u0153\u0166\u0167\u0180\u0181\u0182\u0183\u0187\u0188\u0189" # œŦŧƀƁƂƃƇƈƉ
|
||||
u"\u018A\u018B\u018C\u0190\u0191\u0192\u0193\u0195\u0196\u0197" # ƊƋƌƐƑƒƓƕƖƗ
|
||||
u"\u0198\u0199\u019A\u019D\u019E\u01A2\u01A3\u01A4\u01A5\u01AB" # ƘƙƚƝƞƢƣƤƥƫ
|
||||
u"\u01AC\u01AD\u01AE\u01B2\u01B3\u01B4\u01B5\u01B6\u01E4\u01E5" # ƬƭƮƲƳƴƵƶǤǥ
|
||||
u"\u0221\u0224\u0225\u0234\u0235\u0236\u0237\u0238\u0239\u023A" # ȡȤȥȴȵȶȷȸȹȺ
|
||||
u"\u023B\u023C\u023D\u023E\u023F\u0240\u0243\u0244\u0246\u0247" # ȻȼȽȾȿɀɃɄɆɇ
|
||||
u"\u0248\u0249\u024C\u024D\u024E\u024F\u0253\u0255\u0256\u0257" # ɈɉɌɍɎɏɓɕɖɗ
|
||||
u"\u025B\u025F\u0260\u0261\u0262\u0266\u0267\u0268\u026A\u026B" # ɛɟɠɡɢɦɧɨɪɫ
|
||||
u"\u026C\u026D\u0271\u0272\u0273\u0274\u027C\u027D\u027E\u0280" # ɬɭɱɲɳɴɼɽɾʀ
|
||||
u"\u0282\u0288\u0289\u028B\u028F\u0290\u0291\u0299\u029B\u029C" # ʂʈʉʋʏʐʑʙʛʜ
|
||||
u"\u029D\u029F\u02A0\u02A3\u02A5\u02A6\u02AA\u02AB\u1D03\u1D06" # ʝʟʠʣʥʦʪʫᴃᴆ
|
||||
u"\u1D0C\u1D6B\u1D6C\u1D6D\u1D6E\u1D6F\u1D70\u1D71\u1D72\u1D73" # ᴌᵫᵬᵭᵮᵯᵰᵱᵲᵳ
|
||||
u"\u1D74\u1D75\u1D76\u1D7A\u1D7B\u1D7D\u1D7E\u1D80\u1D81\u1D82" # ᵴᵵᵶᵺᵻᵽᵾᶀᶁᶂ
|
||||
u"\u1D83\u1D84\u1D85\u1D86\u1D87\u1D88\u1D89\u1D8A\u1D8C\u1D8D" # ᶃᶄᶅᶆᶇᶈᶉᶊᶌᶍ
|
||||
u"\u1D8E\u1D8F\u1D91\u1D92\u1D93\u1D96\u1D99\u1E9C\u1E9D\u1E9E" # ᶎᶏᶑᶒᶓᶖᶙẜẝẞ
|
||||
u"\u1EFA\u1EFB\u1EFC\u1EFD\u1EFE\u1EFF\u00A9\u00AE\u20A0\u20A2" # ỺỻỼỽỾỿ©®₠₢
|
||||
u"\u20A3\u20A4\u20A7\u20BA\u20B9\u211E" # ₣₤₧₺₹℞
|
||||
)
|
||||
combinations_to = (
|
||||
u"AEDOETHssaedoethDdHhiqLlNnOEoeTtbBBbCcDDDdEFfGhvII"
|
||||
U"KklNnOIoiPptTtTVYyZzGgdZzlntjdbqpACcLTszBUEe"
|
||||
u"JjRrYybcddejggGhhiIlllmnnNrrrRstuvYzzBGH"
|
||||
u"jLqdzdztslslzBDLuebdfmnprrstzthIpUbdfgklmnprsvx"
|
||||
u"zadeeiussSSLLllVvYy(C)(R)CECrFr.L.PtsTLRsRx"
|
||||
)
|
||||
ascii = u" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
|
||||
|
||||
|
||||
class CompatibilityTest(unittest.TestCase):
|
||||
|
||||
def test_correct(self):
|
||||
self.maxDiff = None
|
||||
self.assertEqual(util.textencoding.unicode_simplify_compatibility(compatibility_from), compatibility_to)
|
||||
self.assertEqual(util.textencoding.unicode_simplify_compatibility(punctuation_from), punctuation_from)
|
||||
self.assertEqual(util.textencoding.unicode_simplify_compatibility(combinations_from), combinations_from)
|
||||
self.assertEqual(util.textencoding.unicode_simplify_compatibility(ascii), ascii)
|
||||
|
||||
def test_incorrect(self):
|
||||
pass
|
||||
|
||||
|
||||
class PunctuationTest(unittest.TestCase):
|
||||
|
||||
def test_correct(self):
|
||||
self.maxDiff = None
|
||||
self.assertEqual(util.textencoding.unicode_simplify_punctuation(compatibility_from), compatibility_from)
|
||||
self.assertEqual(util.textencoding.unicode_simplify_punctuation(punctuation_from), punctuation_to)
|
||||
self.assertEqual(util.textencoding.unicode_simplify_punctuation(combinations_from), combinations_from)
|
||||
self.assertEqual(util.textencoding.unicode_simplify_punctuation(ascii), ascii)
|
||||
|
||||
def test_incorrect(self):
|
||||
pass
|
||||
|
||||
|
||||
class CombinationsTest(unittest.TestCase):
|
||||
|
||||
def test_correct(self):
|
||||
self.maxDiff = None
|
||||
self.assertEqual(util.textencoding.unicode_simplify_combinations(combinations_from), combinations_to)
|
||||
self.assertEqual(util.textencoding.unicode_simplify_combinations(compatibility_from), compatibility_from)
|
||||
self.assertEqual(util.textencoding.unicode_simplify_combinations(punctuation_from), punctuation_from)
|
||||
self.assertEqual(util.textencoding.unicode_simplify_combinations(ascii), ascii)
|
||||
|
||||
def test_incorrect(self):
|
||||
pass
|
||||
|
||||
|
||||
class AsciiPunctTest(unittest.TestCase):
|
||||
|
||||
def test_correct(self):
|
||||
self.assertEqual(util.textencoding.asciipunct(u"‘Test’"), u"'Test'") # Quotations
|
||||
self.assertEqual(util.textencoding.asciipunct(u"“Test”"), u"\"Test\"") # Quotations
|
||||
self.assertEqual(util.textencoding.asciipunct(u"1′6″"), u"1'6\"") # Quotations
|
||||
self.assertEqual(util.textencoding.asciipunct(u"…"), u"...") # Ellipses
|
||||
|
||||
def test_incorrect(self):
|
||||
pass
|
||||
|
||||
|
||||
class UnaccentTest(unittest.TestCase):
|
||||
|
||||
def test_correct(self):
|
||||
self.assertEqual(util.textencoding.unaccent(u"Lukáš"), u"Lukas")
|
||||
self.assertEqual(util.textencoding.unaccent(u"Björk"), u"Bjork")
|
||||
self.assertEqual(util.textencoding.unaccent(u"小室哲哉"), u"小室哲哉")
|
||||
|
||||
def test_incorrect(self):
|
||||
self.assertNotEqual(util.textencoding.unaccent(u"Björk"), u"Björk")
|
||||
self.assertNotEqual(util.textencoding.unaccent(u"小室哲哉"), u"Tetsuya Komuro")
|
||||
self.assertNotEqual(util.textencoding.unaccent(u"Trentemøller"), u"Trentemoller")
|
||||
self.assertNotEqual(util.textencoding.unaccent(u"Ænima"), u"AEnima")
|
||||
self.assertNotEqual(util.textencoding.unaccent(u"ænima"), u"aenima")
|
||||
|
||||
|
||||
class ReplaceNonAsciiTest(unittest.TestCase):
|
||||
|
||||
def test_correct(self):
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"Lukáš"), u"Lukas")
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"Björk"), u"Bjork")
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"Trentemøller"), u"Trentemoeller")
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"Ænima"), u"AEnima")
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"ænima"), u"aenima")
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"小室哲哉"), u"____")
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"ᴀᴄᴇ"), u"ACE") # Latin Letter Small
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"Abc"), u"Abc") # Fullwidth Latin
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"500㎏,2㎓"), u"500kg,2GHz") # Technical
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"⒜⒝⒞"), u"(a)(b)(c)") # Parenthesised Latin
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"ⅯⅯⅩⅣ"), u"MMXIV") # Roman numerals
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"ⅿⅿⅹⅳ"), u"mmxiv") # Roman numerals small
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"⑴⑵⑶"), u"(1)(2)(3)") # Parenthesised numbers
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"⒈ ⒉ ⒊"), u"1. 2. 3.") # Digit full stop
|
||||
self.assertEqual(util.textencoding.replace_non_ascii(u"123"), u"123") # Fullwidth digits
|
||||
|
||||
def test_incorrect(self):
|
||||
self.assertNotEqual(util.textencoding.replace_non_ascii(u"Lukáš"), u"Lukáš")
|
||||
self.assertNotEqual(util.textencoding.replace_non_ascii(u"Lukáš"), u"Luk____")
|
||||
|
||||
if show_latin2ascii_coverage:
|
||||
# The following code set blocks are taken from:
|
||||
# http://en.wikipedia.org/wiki/Latin_script_in_Unicode
|
||||
latin_1 = u"ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
|
||||
latin_a = u"ĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķĸĹĺĻļĽľĿ" \
|
||||
u"ŀŁłŃńŅņŇňʼnŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽž"
|
||||
latin_b = u"ƀƁƂƃƄƅƆƇƈƉƊƋƌƍƎƏƐƑƒƓƔƕƖƗƘƙƚƛƜƝƞƟƠơƢƣƤƥƦƧƨƩƪƫƬƭƮƯưƱƲƳƴƵƶƷƸƹƺƻƼƽƾƿ" \
|
||||
u"ǀǁǂǃDŽDždžLJLjljNJNjnjǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟǠǡǢǣǤǥǦǧǨǩǪǫǬǭǮǯǰDZDzdzǴǵǶǷǸǹǺǻǼǽǾǿ" \
|
||||
u"ȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟȠȡȢȣȤȥȦȧȨȩȪȫȬȭȮȯȰȱȲȳȴȵȶȷȸȹȺȻȼȽȾȿ" \
|
||||
u"ɀɁɂɃɄɅɆɇɈɉɊɋɌɍɎɏ"
|
||||
ipa_ext = u"ɐɑɒɓɔɕɖɗɘəɚɛɜɝɞɟɠɡɢɣɤɥɦɧɨɩɪɫɬɭɮɯɰɱɲɳɴɵɶɷɸɹɺɻɼɽɾɿʀʁʂʃʄʅʆʇʈʉʊʋʌʍʎʏ" \
|
||||
u"ʐʑʒʓʔʕʖʗʘʙʚʛʜʝʞʟʠʡʢʣʤʥʦʧʨʩʪʫʬʭʮʯ"
|
||||
phonetic = u"ᴀᴁᴂᴃᴄᴅᴆᴇᴈᴉᴊᴋᴌᴍᴎᴏᴐᴑᴒᴓᴔᴕᴖᴗᴘᴙᴚᴛᴜᴝᴞᴟᴠᴡᴢᴣᴤᴥᴦᴧᴨᴩᴪᴫᴬᴭᴮᴯᴰᴱᴲᴳᴴᴵᴶᴷᴸᴹᴺᴻᴼᴽᴾᴿ" \
|
||||
u"ᵀᵁᵂᵃᵄᵅᵆᵇᵈᵉᵊᵋᵌᵍᵎᵏᵐᵑᵒᵓᵔᵕᵖᵗᵘᵙᵚᵛᵜᵝᵞᵟᵠᵡᵢᵣᵤᵥᵦᵧᵨᵩᵪᵫᵬᵭᵮᵯᵰᵱᵲᵳᵴᵵᵶᵷᵸᵹᵺᵻᵼᵽᵾᵿ" \
|
||||
u"ᶀᶁᶂᶃᶄᶅᶆᶇᶈᶉᶊᶋᶌᶍᶎᶏᶐᶑᶒᶓᶔᶕᶖᶗᶘᶙᶚᶛᶜᶝᶞᶟᶠᶡᶢᶣᶤᶥᶦᶧᶨᶩᶪᶫᶬᶭᶮᶯᶰᶱᶲᶳᶴᶵᶶᶷᶸᶹᶺᶻᶼᶽᶾᶿ"
|
||||
latin_ext_add = u"ḀḁḂḃḄḅḆḇḈḉḊḋḌḍḎḏḐḑḒḓḔḕḖḗḘḙḚḛḜḝḞḟḠḡḢḣḤḤḦḧḨḩḪḫḬḭḮḯḰḱḲḳḴḵḶḷḸḹḺḻḼḽḾḿ" \
|
||||
u"ṀṁṂṃṄṅṆṇṈṉṊṋṌṍṎṏṐṑṒṓṔṕṖṗṘṙṚṛṜṝṞṟṠṡṢṣṤṥṦṧṨṩṪṫṬṭṮṯṰṱṲṳṴṵṶṷṸṹṺṻṼṽṾṿ" \
|
||||
u"ẀẁẂẃẄẅẆẇẈẉẊẋẌẍẎẏẐẑẒẓẔẕẖẗẘẙẚẛẜẝẞẟẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾế" \
|
||||
u"ỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹỺỻỼỽỾỿ"
|
||||
letter_like = u"℀℁ℂ℃℄℅℆ℇ℈℉ℊℋℌℍℎℏℐℑℒℓ℔ℕ№℗℘ℙℚℛℜℝ℞℟℠℡™℣ℤ℥Ω℧ℨ℩KÅℬℭ℮ℯℰℱℲℳℴℵℶℷℸℹ℺℻ℼℽℾℿ" \
|
||||
u"⅀⅁⅂⅃⅄ⅅⅆⅇⅈⅉ⅊⅋⅌⅍ⅎ⅏"
|
||||
enclosed = u"⒐⒑⒒⒓⒔⒕⒖⒗⒘⒙⒚⒛⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ" \
|
||||
u"ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ⓪⓫⓬⓭⓮⓯"
|
||||
|
||||
print "The following lines show the coverage of Latin characters conversion to ascii."
|
||||
print "Underscores are characters which currently do not have an ASCII representation."
|
||||
print
|
||||
print "latin-1: ",util.textencoding.replace_non_ascii(latin_1)
|
||||
print "latin-1: ",util.textencoding.replace_non_ascii(latin_1)
|
||||
print "latin-a: ",util.textencoding.replace_non_ascii(latin_a)
|
||||
print "latin-b: ",util.textencoding.replace_non_ascii(latin_b)
|
||||
print "ipa-ext: ",util.textencoding.replace_non_ascii(ipa_ext)
|
||||
print "phonetic: ",util.textencoding.replace_non_ascii(phonetic)
|
||||
print "latin-ext-add: ",util.textencoding.replace_non_ascii(latin_ext_add)
|
||||
print "letter-like: ",util.textencoding.replace_non_ascii(letter_like)
|
||||
print "enclosed: ",util.textencoding.replace_non_ascii(enclosed)
|
||||
print
|
||||
|
||||
Reference in New Issue
Block a user