mirror of
https://github.com/fergalmoran/picard.git
synced 2026-01-06 16:44:06 +00:00
Address various comments by Philipp Wolfer ...
... based on actual knowledge of how characters are used in practice: (1) low quotation mark -> normal quotation mark rather than comma; (2) oi should be gha -> gh rather than oi; (3) ideographic punctuation, multiplication / division and middle dot are not punctuation.
This commit is contained in:
@@ -108,11 +108,11 @@ _simplify_punctuation = {
|
||||
u"\u0140": u"l", # LATIN SMALL LETTER L WITH MIDDLE DOT (compat)
|
||||
u"\u2018": u"'", # LEFT SINGLE QUOTATION MARK (from ‹character-fallback›)
|
||||
u"\u2019": u"'", # RIGHT SINGLE QUOTATION MARK (from ‹character-fallback›)
|
||||
u"\u201A": u",", # SINGLE LOW-9 QUOTATION MARK (from ‹character-fallback›)
|
||||
u"\u201A": u"'", # SINGLE LOW-9 QUOTATION MARK (from ‹character-fallback›)
|
||||
u"\u201B": u"'", # SINGLE HIGH-REVERSED-9 QUOTATION MARK (from ‹character-fallback›)
|
||||
u"\u201C": u"\"", # LEFT DOUBLE QUOTATION MARK (from ‹character-fallback›)
|
||||
u"\u201D": u"\"", # RIGHT DOUBLE QUOTATION MARK (from ‹character-fallback›)
|
||||
u"\u201E": u",,", # DOUBLE LOW-9 QUOTATION MARK (from ‹character-fallback›)
|
||||
u"\u201E": u"\"", # DOUBLE LOW-9 QUOTATION MARK (from ‹character-fallback›)
|
||||
u"\u201F": u"\"", # DOUBLE HIGH-REVERSED-9 QUOTATION MARK (from ‹character-fallback›)
|
||||
u"\u2032": u"'", # PRIME
|
||||
u"\u2033": u"\"", # DOUBLE PRIME
|
||||
@@ -137,8 +137,6 @@ _simplify_punctuation = {
|
||||
u"\u2045": u"[", # LEFT SQUARE BRACKET WITH QUILL
|
||||
u"\u2046": u"]", # RIGHT SQUARE BRACKET WITH QUILL
|
||||
u"\u204E": u"*", # LOW ASTERISK
|
||||
u"\u3001": u",", # IDEOGRAPHIC COMMA
|
||||
u"\u3002": u".", # IDEOGRAPHIC FULL STOP
|
||||
u"\u3008": u"<", # LEFT ANGLE BRACKET
|
||||
u"\u3009": u">", # RIGHT ANGLE BRACKET
|
||||
u"\u300A": u"<<", # LEFT DOUBLE ANGLE BRACKET
|
||||
@@ -164,8 +162,6 @@ _simplify_punctuation = {
|
||||
u"\uFF60": u"))", # FULLWIDTH RIGHT WHITE PARENTHESIS (compat)(from ‹character-fallback›)
|
||||
u"\uFF61": u".", # HALFWIDTH IDEOGRAPHIC FULL STOP (compat)
|
||||
u"\uFF64": u",", # HALFWIDTH IDEOGRAPHIC COMMA (compat)
|
||||
u"\u00D7": u"x", # MULTIPLICATION SIGN
|
||||
u"\u00F7": u"/", # DIVISION SIGN
|
||||
u"\u2212": u"-", # MINUS SIGN (from ‹character-fallback›)
|
||||
u"\u2215": u"/", # DIVISION SLASH (from ‹character-fallback›)
|
||||
u"\u2216": u"\\", # SET MINUS (from ‹character-fallback›)
|
||||
@@ -175,7 +171,6 @@ _simplify_punctuation = {
|
||||
u"\u226B": u">>", # MUCH GREATER-THAN
|
||||
u"\u2985": u"((", # LEFT WHITE PARENTHESIS
|
||||
u"\u2986": u"))", # RIGHT WHITE PARENTHESIS
|
||||
u"\u00B7": u".", # MIDDLE DOT
|
||||
u"\u200B": u"", # Zero Width Space
|
||||
}
|
||||
_re_simplify_punctuation = _re_any(_simplify_punctuation.keys())
|
||||
@@ -231,8 +226,8 @@ _simplify_combinations = {
|
||||
u"\u019A": u"l", # LATIN SMALL LETTER L WITH BAR
|
||||
u"\u019D": u"N", # LATIN CAPITAL LETTER N WITH LEFT HOOK
|
||||
u"\u019E": u"n", # LATIN SMALL LETTER N WITH LONG RIGHT LEG
|
||||
u"\u01A2": u"OI", # LATIN CAPITAL LETTER OI
|
||||
u"\u01A3": u"oi", # LATIN SMALL LETTER OI
|
||||
u"\u01A2": u"GH", # LATIN CAPITAL LETTER GHA (see http://unicode.org/notes/tn27/)
|
||||
u"\u01A3": u"gh", # LATIN SMALL LETTER GHA (see http://unicode.org/notes/tn27/)
|
||||
u"\u01A4": u"P", # LATIN CAPITAL LETTER P WITH HOOK
|
||||
u"\u01A5": u"p", # LATIN SMALL LETTER P WITH HOOK
|
||||
u"\u01AB": u"t", # LATIN SMALL LETTER T WITH PALATAL HOOK
|
||||
@@ -399,6 +394,11 @@ _simplify_combinations = {
|
||||
u"\u215D": u" 5/8", # VULGAR FRACTION FIVE EIGHTHS (from ‹character-fallback›)
|
||||
u"\u215E": u" 7/8", # VULGAR FRACTION SEVEN EIGHTHS (from ‹character-fallback›)
|
||||
u"\u215F": u" 1/", # FRACTION NUMERATOR ONE (from ‹character-fallback›)
|
||||
u"\u3001": u",", # IDEOGRAPHIC COMMA
|
||||
u"\u3002": u".", # IDEOGRAPHIC FULL STOP
|
||||
u"\u00D7": u"x", # MULTIPLICATION SIGN
|
||||
u"\u00F7": u"/", # DIVISION SIGN
|
||||
u"\u00B7": u".", # MIDDLE DOT
|
||||
u"\u1E9F": u"dd", # LATIN SMALL LETTER DELTA
|
||||
u"\u0184": u"H", # LATIN CAPITAL LETTER TONE SIX
|
||||
u"\u0185": u"h", # LATIN SMALL LETTER TONE SIX
|
||||
|
||||
@@ -68,11 +68,11 @@ compatibility_to += u"ACDEJKMOPTUVWZ0 "
|
||||
punctuation_from = (
|
||||
u"\u2018\u2019\u201A\u201B\u201C\u201D\u201E\u201F\u2032\u301D" # ‘’‚‛“”„‟′〝
|
||||
u"\u301E\u00AB\u00BB\u2039\u203A\u00AD\u2010\u2012\u2013\u2014" # 〞«»‹›\u00AD‐‒–—
|
||||
u"\u2015\u2016\u2044\u2045\u2046\u204E\u3001\u3002\u3008\u3009" # ―‖⁄⁅⁆⁎、。〈〉
|
||||
u"\u300A\u300B\u3014\u3015\u3018\u3019\u301A\u301B\u00D7\u00F7" # 《》〔〕〘〙〚〛×÷
|
||||
u"\u2212\u2215\u2216\u2223\u2225\u226A\u226B\u2985\u2986\u00B7" # −∕∖∣∥≪≫⦅⦆·
|
||||
u"\u2015\u2016\u2044\u2045\u2046\u204E\u3008\u3009\u300A\u300B" # ―‖⁄⁅⁆⁎〈〉《》
|
||||
u"\u3014\u3015\u3018\u3019\u301A\u301B\u2212\u2215\u2216\u2223" # 〔〕〘〙〚〛−∕∖∣
|
||||
u"\u2225\u226A\u226B\u2985\u2986\u200B" # ∥≪≫⦅⦆·
|
||||
)
|
||||
punctuation_to = u"'','\"\",,\"'\"\"<<>><>-----||/[]*,.<><<>>[][][]x/-/\\|||<<>>(())."
|
||||
punctuation_to = u"''''\"\"\"\"'\"\"<<>><>-----||/[]*<><<>>[][][]-/\\|||<<>>(())"
|
||||
combinations_from = (
|
||||
u"\u00C6\u00D0\u00D8\u00DE\u00DF\u00E6\u00F0\u00F8\u00FE\u0110" # ÆÐØÞßæðøþĐ
|
||||
u"\u0111\u0126\u0127\u0131\u0138\u0141\u0142\u014A\u014B\u0152" # đĦħıĸŁłŊŋŒ
|
||||
@@ -92,14 +92,15 @@ combinations_from = (
|
||||
u"\u1D83\u1D84\u1D85\u1D86\u1D87\u1D88\u1D89\u1D8A\u1D8C\u1D8D" # ᶃᶄᶅᶆᶇᶈᶉᶊᶌᶍ
|
||||
u"\u1D8E\u1D8F\u1D91\u1D92\u1D93\u1D96\u1D99\u1E9C\u1E9D\u1E9E" # ᶎᶏᶑᶒᶓᶖᶙẜẝẞ
|
||||
u"\u1EFA\u1EFB\u1EFC\u1EFD\u1EFE\u1EFF\u00A9\u00AE\u20A0\u20A2" # ỺỻỼỽỾỿ©®₠₢
|
||||
u"\u20A3\u20A4\u20A7\u20BA\u20B9\u211E" # ₣₤₧₺₹℞
|
||||
u"\u20A3\u20A4\u20A7\u20BA\u20B9\u211E\u3001\u3002\u00D7\u00F7" # ₣₤₧₺₹℞、。×÷
|
||||
u"\u00B7\u1E9F\u0184\u0185\u01BE" # ·ẟƄƅƾ
|
||||
)
|
||||
combinations_to = (
|
||||
u"AEDOETHssaedoethDdHhiqLlNnOEoeTtbBBbCcDDDdEFfGhvII"
|
||||
U"KklNnOIoiPptTtTVYyZzGgdZzlntjdbqpACcLTszBUEe"
|
||||
u"KklNnGHghPptTtTVYyZzGgdZzlntjdbqpACcLTszBUEe"
|
||||
u"JjRrYybcddejggGhhiIlllmnnNrrrRstuvYzzBGH"
|
||||
u"jLqdzdztslslzBDLuebdfmnprrstzthIpUbdfgklmnprsvx"
|
||||
u"zadeeiussSSLLllVvYy(C)(R)CECrFr.L.PtsTLRsRx"
|
||||
u"zadeeiussSSLLllVvYy(C)(R)CECrFr.L.PtsTLRsRx,.x/.ddHhts"
|
||||
)
|
||||
ascii = u" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user