# Patch summary (reviewer annotations; everything after this comment block is
# the original patch text, unchanged).
#
# Files touched:
#   picard/metadata.py       - the import from picard.similarity now also
#                              brings in raw_similarity.
#   picard/similarity.py     - whitespace-only separator lines between the
#                              top-level definitions are replaced by blank
#                              lines (two before boost, raw_similarity and
#                              _split_re); docstrings are added to
#                              raw_similarity() and similarity(); two inline
#                              comments and the commented-out debug prints
#                              are removed.
#                              Behavior change in similarity(): the weighted
#                              blend ``sim1 * 0.1 + sim2 * 0.9`` is now used
#                              only when at least one of the filtered strings
#                              (a2, b2) is non-empty; otherwise the raw
#                              similarity of the unfiltered inputs (sim1) is
#                              returned. Before the patch sim2 was always
#                              computed, and because raw_similarity() returns
#                              0.0 when either argument is empty, inputs whose
#                              filtered forms were both empty scored
#                              sim1 * 0.1.
#   picard/util.py           - imports become os.path, re, sys, unicodedata;
#                              adds unaccent(), replace_non_ascii() and
#                              replace_win32_incompat(), each backed by a
#                              module-level compiled regex.
#   test/test_similarity.py  - the two placeholder tests are replaced by
#                              test_correct, which checks similarity() on
#                              ("K!", "K!"), ("BBB", "AAA") and
#                              ("ABC", "ABB").
#   test/test_utils.py       - new file with tests for unaccent,
#                              replace_non_ascii, replace_win32_incompat and
#                              sanitize_date.
#
# NOTE(review): the patch text below appears to have lost its original line
#   breaks (whole hunks sit on a single line) - presumably it has to be
#   restored from the original commit before a patch tool can apply it.
# NOTE(review): unaccent() calls unicodedata.name(char) without a default.
#   Per the Python documentation that call raises ValueError for characters
#   that have no name (control characters such as "\n" or "\t", presumably),
#   so passing a default - e.g. unicodedata.name(char, "") - looks safer;
#   confirm against expected inputs.
# NOTE(review): the docstring added to similarity() refers to ``a`` and
#   ``b``, but the parameters are named a1 and b1.
# NOTE(review): the unaccent() docstring reads "Remove accents ``string``."
#   - probably meant "Remove accents from ``string``."
# NOTE(review): _split_re is compiled from the non-raw literal "\W" (an
#   unchanged context line); a raw string r"\W" would be the conventional
#   spelling.
# NOTE(review): raw_similarity is newly imported in picard/metadata.py and
#   test/test_similarity.py, but no use of it is visible in these hunks -
#   can't tell from here whether the rest of those files use it.
# NOTE(review): when only one of a2 / b2 is empty the blend is still used
#   with sim2 == 0.0 - confirm that this is intended.
diff --git a/picard/metadata.py b/picard/metadata.py index d9be5c2cf..2b617cbfb 100644 --- a/picard/metadata.py +++ b/picard/metadata.py @@ -19,7 +19,7 @@ from PyQt4 import QtCore from copy import copy -from picard.similarity import similarity +from picard.similarity import similarity, raw_similarity class Metadata(QtCore.QObject): diff --git a/picard/similarity.py b/picard/similarity.py index 0f246a3fe..1d45246af 100644 --- a/picard/similarity.py +++ b/picard/similarity.py @@ -20,6 +20,7 @@ import math import re + def distance(a,b): """Calculates the Levenshtein distance between a and b.""" @@ -40,22 +41,24 @@ def distance(a,b): current[j] = min(add, delete, change) return current[n] - + + def boost(sim): sim2 = sim sim = min(1, (math.exp(sim) - 1) / (math.e - 1.2)) sim = math.pow(sim, 0.8) sim = max(sim2, sim) return sim - + + def raw_similarity(a, b): + """Calculates raw similarity of strings ``a`` and ``b``.""" if not a or not b: return 0.0 - # string distance => <0,1> similarity sim = 1 - distance(a, b) * 1.0 / max(len(a), len(b)) - # human brain doesn't think linear! :) return boost(sim) - + + _split_re = re.compile("\W", re.UNICODE) _stop_words = ["the", "--", "in", "of", "a", "feat"] @@ -69,8 +72,9 @@ _replace_words = { "disc 7": "CD7", "disc 8": "CD8", } - + def similarity(a1, b1): + """Calculates "smart" similarity of strings ``a`` and ``b``.""" a2 = a1 b2 = b1 for w, r in _replace_words.items(): @@ -78,16 +82,14 @@ def similarity(a1, b1): def flt(a): def flt(a): return a not in _stop_words and len(a) > 1 -# print _split_re.split(a.lower()) return u" ".join(filter(flt, _split_re.split(a.lower()))) a2 = flt(a2) b2 = flt(b2) sim1 = raw_similarity(a1, b1) - sim2 = raw_similarity(a2, b2) - #print a2, b2 - #print sim1, sim2 - # just to not have 100% matches on e.g. 
'ABC' vs 'abc' - sim = sim1 * 0.1 + sim2 * 0.9 - #sim = sim2 + if a2 or b2: + sim2 = raw_similarity(a2, b2) + sim = sim1 * 0.1 + sim2 * 0.9 + else: + sim = sim1 return sim diff --git a/picard/util.py b/picard/util.py index e6e166eae..a7d3de33b 100644 --- a/picard/util.py +++ b/picard/util.py @@ -18,11 +18,12 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -import sys import os.path +import re +import sys +import unicodedata from PyQt4 import QtCore - class LockableObject(QtCore.QObject): """Read/write lockable object.""" @@ -88,3 +89,26 @@ def sanitize_date(datestr): date.append(num) return ("", "%04d", "%04d-%02d", "%04d-%02d-%02d")[len(date)] % tuple(date) +_re_latin_letter = re.compile(r"^(LATIN [A-Z]+ LETTER [A-Z]+) WITH") +def unaccent(string): + """Remove accents ``string``.""" + result = [] + for char in string: + name = unicodedata.name(char) + match = _re_latin_letter.search(name) + if match: + char = unicodedata.lookup(match.group(1)) + result.append(char) + return "".join(result) + +_re_non_ascii = re.compile(r'[^\x00-\x7F]', re.UNICODE) +def replace_non_ascii(string, repl="_"): + """Replace non-ASCII characters from ``string`` by ``repl``.""" + return _re_non_ascii.sub(repl, string) + +_re_win32_incompat = re.compile(r'[\\"*/:<>?|]', re.UNICODE) +def replace_win32_incompat(string, repl="_"): + """Replace win32 filename incompatible characters from ``string`` by + ``repl``.""" + return _re_win32_incompat.sub(repl, string) + diff --git a/test/test_similarity.py b/test/test_similarity.py index a7b644966..fbfafb555 100644 --- a/test/test_similarity.py +++ b/test/test_similarity.py @@ -1,12 +1,10 @@ import unittest -from picard.similarity import similarity +from picard.similarity import similarity, raw_similarity class SimilarityTest(unittest.TestCase): - def testTagz(self): - self.failUnless(True) - pass + def test_correct(self): + 
self.failUnlessEqual(similarity("K!", "K!"), 1.0) + self.failUnlessEqual(similarity("BBB", "AAA"), 0.0) + self.failUnlessAlmostEqual(similarity("ABC", "ABB"), 0.7, 1) - def test_tagz2(self): - self.failUnless(True) - pass diff --git a/test/test_utils.py b/test/test_utils.py new file mode 100644 index 000000000..9b7f3ce67 --- /dev/null +++ b/test/test_utils.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- + +import unittest +from picard import util + + +class UnaccentTest(unittest.TestCase): + + def test_correct(self): + self.failUnlessEqual(util.unaccent(u"Lukáš"), u"Lukas") + self.failUnlessEqual(util.unaccent(u"Björk"), u"Bjork") + self.failUnlessEqual(util.unaccent(u"Trentemøller"), u"Trentemoller") + self.failUnlessEqual(util.unaccent(u"小室哲哉"), u"小室哲哉") + + def test_incorrect(self): + self.failIfEqual(util.unaccent(u"Björk"), u"Björk") + self.failIfEqual(util.unaccent(u"小室哲哉"), u"Tetsuya Komuro") + + +class ReplaceNonAsciiTest(unittest.TestCase): + + def test_correct(self): + self.failUnlessEqual(util.replace_non_ascii(u"Lukáš"), u"Luk__") + self.failUnlessEqual(util.replace_non_ascii(u"Björk"), u"Bj_rk") + self.failUnlessEqual(util.replace_non_ascii(u"Trentemøller"), u"Trentem_ller") + self.failUnlessEqual(util.replace_non_ascii(u"小室哲哉"), u"____") + + def test_incorrect(self): + self.failIfEqual(util.replace_non_ascii(u"Lukáš"), u"Lukáš") + self.failIfEqual(util.replace_non_ascii(u"Lukáš"), u"Luk____") + + +class ReplaceWin32IncompatTest(unittest.TestCase): + + def test_correct(self): + self.failUnlessEqual(util.replace_win32_incompat("c:\\test\\te\"st2"), + "c__test_te_st2") + + def test_incorrect(self): + self.failIfEqual(util.replace_win32_incompat("c:\\test\\te\"st2"), + "c:\\test\\te\"st2") + + +class SanitizeDateTest(unittest.TestCase): + + def test_correct(self): + self.failUnlessEqual(util.sanitize_date("2006--"), "2006") + self.failUnlessEqual(util.sanitize_date("2006--02"), "2006") + self.failUnlessEqual(util.sanitize_date("2006 "), "2006") + 
self.failUnlessEqual(util.sanitize_date("2006 02"), "") + self.failUnlessEqual(util.sanitize_date("2006.02"), "") + self.failUnlessEqual(util.sanitize_date("2006-02"), "2006-02") + + def test_incorrect(self): + self.failIfEqual(util.sanitize_date("2006--02"), "2006-02") + self.failIfEqual(util.sanitize_date("2006.03.02"), "2006-03-02") +