Address various comments by Philipp Wolfer ...

... based on actual knowledge of how characters are used in practice.

Low-9 quotation marks (U+201A, U+201E) now map to the normal single / double quotation mark rather than to commas.
U+01A2 / U+01A3 (named "LATIN LETTER OI") are actually the letter gha, so they now map to GH / gh rather than OI / oi.
Ideographic comma and full stop, the multiplication and division signs, and the middle dot are not punctuation, so they move from the punctuation table to the combinations table.
This commit is contained in:
Sophist
2014-03-28 10:42:47 +00:00
parent 110441f91e
commit e406a554bc
2 changed files with 17 additions and 16 deletions

View File

@@ -108,11 +108,11 @@ _simplify_punctuation = {
u"\u0140": u"l", # LATIN SMALL LETTER L WITH MIDDLE DOT (compat)
u"\u2018": u"'", # LEFT SINGLE QUOTATION MARK (from character-fallback)
u"\u2019": u"'", # RIGHT SINGLE QUOTATION MARK (from character-fallback)
u"\u201A": u",", # SINGLE LOW-9 QUOTATION MARK (from character-fallback)
u"\u201A": u"'", # SINGLE LOW-9 QUOTATION MARK (from character-fallback)
u"\u201B": u"'", # SINGLE HIGH-REVERSED-9 QUOTATION MARK (from character-fallback)
u"\u201C": u"\"", # LEFT DOUBLE QUOTATION MARK (from character-fallback)
u"\u201D": u"\"", # RIGHT DOUBLE QUOTATION MARK (from character-fallback)
u"\u201E": u",,", # DOUBLE LOW-9 QUOTATION MARK (from character-fallback)
u"\u201E": u"\"", # DOUBLE LOW-9 QUOTATION MARK (from character-fallback)
u"\u201F": u"\"", # DOUBLE HIGH-REVERSED-9 QUOTATION MARK (from character-fallback)
u"\u2032": u"'", # PRIME
u"\u2033": u"\"", # DOUBLE PRIME
@@ -137,8 +137,6 @@ _simplify_punctuation = {
u"\u2045": u"[", # LEFT SQUARE BRACKET WITH QUILL
u"\u2046": u"]", # RIGHT SQUARE BRACKET WITH QUILL
u"\u204E": u"*", # LOW ASTERISK
u"\u3001": u",", # IDEOGRAPHIC COMMA
u"\u3002": u".", # IDEOGRAPHIC FULL STOP
u"\u3008": u"<", # LEFT ANGLE BRACKET
u"\u3009": u">", # RIGHT ANGLE BRACKET
u"\u300A": u"<<", # LEFT DOUBLE ANGLE BRACKET
@@ -164,8 +162,6 @@ _simplify_punctuation = {
u"\uFF60": u"))", # FULLWIDTH RIGHT WHITE PARENTHESIS (compat)(from character-fallback)
u"\uFF61": u".", # HALFWIDTH IDEOGRAPHIC FULL STOP (compat)
u"\uFF64": u",", # HALFWIDTH IDEOGRAPHIC COMMA (compat)
u"\u00D7": u"x", # MULTIPLICATION SIGN
u"\u00F7": u"/", # DIVISION SIGN
u"\u2212": u"-", # MINUS SIGN (from character-fallback)
u"\u2215": u"/", # DIVISION SLASH (from character-fallback)
u"\u2216": u"\\", # SET MINUS (from character-fallback)
@@ -175,7 +171,6 @@ _simplify_punctuation = {
u"\u226B": u">>", # MUCH GREATER-THAN
u"\u2985": u"((", # LEFT WHITE PARENTHESIS
u"\u2986": u"))", # RIGHT WHITE PARENTHESIS
u"\u00B7": u".", # MIDDLE DOT
u"\u200B": u"", # Zero Width Space
}
_re_simplify_punctuation = _re_any(_simplify_punctuation.keys())
@@ -231,8 +226,8 @@ _simplify_combinations = {
u"\u019A": u"l", # LATIN SMALL LETTER L WITH BAR
u"\u019D": u"N", # LATIN CAPITAL LETTER N WITH LEFT HOOK
u"\u019E": u"n", # LATIN SMALL LETTER N WITH LONG RIGHT LEG
u"\u01A2": u"OI", # LATIN CAPITAL LETTER OI
u"\u01A3": u"oi", # LATIN SMALL LETTER OI
u"\u01A2": u"GH", # LATIN CAPITAL LETTER GHA (see http://unicode.org/notes/tn27/)
u"\u01A3": u"gh", # LATIN SMALL LETTER GHA (see http://unicode.org/notes/tn27/)
u"\u01A4": u"P", # LATIN CAPITAL LETTER P WITH HOOK
u"\u01A5": u"p", # LATIN SMALL LETTER P WITH HOOK
u"\u01AB": u"t", # LATIN SMALL LETTER T WITH PALATAL HOOK
@@ -399,6 +394,11 @@ _simplify_combinations = {
u"\u215D": u" 5/8", # VULGAR FRACTION FIVE EIGHTHS (from character-fallback)
u"\u215E": u" 7/8", # VULGAR FRACTION SEVEN EIGHTHS (from character-fallback)
u"\u215F": u" 1/", # FRACTION NUMERATOR ONE (from character-fallback)
u"\u3001": u",", # IDEOGRAPHIC COMMA
u"\u3002": u".", # IDEOGRAPHIC FULL STOP
u"\u00D7": u"x", # MULTIPLICATION SIGN
u"\u00F7": u"/", # DIVISION SIGN
u"\u00B7": u".", # MIDDLE DOT
u"\u1E9F": u"dd", # LATIN SMALL LETTER DELTA
u"\u0184": u"H", # LATIN CAPITAL LETTER TONE SIX
u"\u0185": u"h", # LATIN SMALL LETTER TONE SIX

View File

@@ -68,11 +68,11 @@ compatibility_to += u"ACDEJKMOPTUVWZ0 "
punctuation_from = (
u"\u2018\u2019\u201A\u201B\u201C\u201D\u201E\u201F\u2032\u301D" # ‘’‚‛“”„‟′〝
u"\u301E\u00AB\u00BB\u2039\u203A\u00AD\u2010\u2012\u2013\u2014" # 〞«»‹›\u00AD
u"\u2015\u2016\u2044\u2045\u2046\u204E\u3001\u3002\u3008\u3009" # ―‖⁄⁅⁆⁎、。〈〉
u"\u300A\u300B\u3014\u3015\u3018\u3019\u301A\u301B\u00D7\u00F7" # 《》〔〕〘〙〚〛×÷
u"\u2212\u2215\u2216\u2223\u2225\u226A\u226B\u2985\u2986\u00B7" # ∥≪≫⦅⦆·
u"\u2015\u2016\u2044\u2045\u2046\u204E\u3008\u3009\u300A\u300B" # ―‖⁄⁅⁆⁎〈〉《》
u"\u3014\u3015\u3018\u3019\u301A\u301B\u2212\u2215\u2216\u2223" # 〔〕〘〙〚〛
u"\u2225\u226A\u226B\u2985\u2986\u200B" # ∥≪≫⦅⦆·
)
punctuation_to = u"'','\"\",,\"'\"\"<<>><>-----||/[]*,.<><<>>[][][]x/-/\\|||<<>>(())."
punctuation_to = u"''''\"\"\"\"'\"\"<<>><>-----||/[]*<><<>>[][][]-/\\|||<<>>(())"
combinations_from = (
u"\u00C6\u00D0\u00D8\u00DE\u00DF\u00E6\u00F0\u00F8\u00FE\u0110" # ÆÐØÞßæðøþĐ
u"\u0111\u0126\u0127\u0131\u0138\u0141\u0142\u014A\u014B\u0152" # đĦħıĸŁłŊŋŒ
@@ -92,14 +92,15 @@ combinations_from = (
u"\u1D83\u1D84\u1D85\u1D86\u1D87\u1D88\u1D89\u1D8A\u1D8C\u1D8D" # ᶃᶄᶅᶆᶇᶈᶉᶊᶌᶍ
u"\u1D8E\u1D8F\u1D91\u1D92\u1D93\u1D96\u1D99\u1E9C\u1E9D\u1E9E" # ᶎᶏᶑᶒᶓᶖᶙẜẝẞ
u"\u1EFA\u1EFB\u1EFC\u1EFD\u1EFE\u1EFF\u00A9\u00AE\u20A0\u20A2" # ỺỻỼỽỾỿ©®₠₢
u"\u20A3\u20A4\u20A7\u20BA\u20B9\u211E" # ₣₤₧₺₹℞
u"\u20A3\u20A4\u20A7\u20BA\u20B9\u211E\u3001\u3002\u00D7\u00F7" # ₣₤₧₺₹℞、。×÷
u"\u00B7\u1E9F\u0184\u0185\u01BE" # ·ẟƄƅƾ
)
combinations_to = (
u"AEDOETHssaedoethDdHhiqLlNnOEoeTtbBBbCcDDDdEFfGhvII"
U"KklNnOIoiPptTtTVYyZzGgdZzlntjdbqpACcLTszBUEe"
u"KklNnGHghPptTtTVYyZzGgdZzlntjdbqpACcLTszBUEe"
u"JjRrYybcddejggGhhiIlllmnnNrrrRstuvYzzBGH"
u"jLqdzdztslslzBDLuebdfmnprrstzthIpUbdfgklmnprsvx"
u"zadeeiussSSLLllVvYy(C)(R)CECrFr.L.PtsTLRsRx"
u"zadeeiussSSLLllVvYy(C)(R)CECrFr.L.PtsTLRsRx,.x/.ddHhts"
)
ascii = u" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~"