* Replace custom Levenshtein distance implementation with difflib.SequenceMatcher

* Better files->album matching.
This commit is contained in:
Lukáš Lalinský
2006-09-18 21:26:37 +02:00
parent 8486dbd5e0
commit 164904dd7a
6 changed files with 62 additions and 75 deletions

View File

@@ -159,7 +159,7 @@ class Album(DataObject):
sim = file.orig_metadata.compare(track.metadata)
if sim > bestMatch[0]:
bestMatch = sim, track
if bestMatch[1]:
file.move_to_track(bestMatch[1])

View File

@@ -41,14 +41,11 @@ class Metadata(QtCore.QObject):
parts = []
tags = {
"musicbrainz_trackid": 10,
"musicbrainz_artistid": 10,
"musicbrainz_albumid": 10,
"~#length": 16,
"title": 14,
"artist": 8,
"album": 10,
"tracknumber": 12,
"title": 20,
"artist": 6,
"album": 12,
"tracknumber": 5,
}
identical = [
@@ -60,25 +57,25 @@ class Metadata(QtCore.QObject):
"discnumber",
"totaldiscs",
]
for tag in self.keys():
if tag not in tags and not tag.startswith("~"):
tags[tag] = 1
for tag in other.keys():
if tag not in tags and not tag.startswith("~"):
tags[tag] = 1
#for tag in self.keys():
# if tag not in tags and not tag.startswith("~"):
# tags[tag] = 1
#for tag in other.keys():
# if tag not in tags and not tag.startswith("~"):
# tags[tag] = 1
for tag, weight in tags.items():
if self[tag] and other[tag]:
if tag in identical:
sim = 1.0 - abs(cmp(self[tag], other[tag]))
elif tag in ["~#length"]:
elif tag == "~#length":
sim = 1.0 - min(abs(self[tag] - other[tag]), 30000) / 30000.0
else:
sim = similarity(self[tag], other[tag])
parts.append((sim, weight))
total = reduce(lambda x, y: x + y[1], parts, 0.0)
return reduce(lambda x, y: x + y[0] * y[1] / total, parts, 0.0)

View File

@@ -19,45 +19,14 @@
import math
import re
def distance(a, b):
    """Calculates the Levenshtein distance between a and b.

    Both arguments are sequences supporting ``len()`` and iteration (for
    example strings). The result is the minimum number of single-item
    insertions, deletions and substitutions that turn one into the other.
    """
    # Keep ``a`` as the shorter sequence so a row never holds more than
    # min(len(a), len(b)) + 1 cells.
    if len(a) > len(b):
        a, b = b, a
    width = len(a)
    row = range(width + 1)
    for row_index, b_item in enumerate(b):
        # ``above`` is the previous row; the new row starts with the cost of
        # deleting every item of ``b`` seen so far.
        above, row = row, [row_index + 1] + [0] * width
        for col, a_item in enumerate(a):
            substitute = above[col]
            if a_item != b_item:
                substitute = substitute + 1
            row[col + 1] = min(above[col + 1] + 1, row[col] + 1, substitute)
    return row[width]
def boost(sim):
    """Returns ``sim`` pushed through a boosting curve, never below ``sim``.

    The curve is ``(e**sim - 1) / (e - 1.2)`` capped at 1 and then raised to
    the power 0.8; the larger of that value and the original ``sim`` is
    returned.
    """
    curved = min(1, (math.exp(sim) - 1) / (math.e - 1.2))
    curved = math.pow(curved, 0.8)
    return max(sim, curved)
from difflib import SequenceMatcher
from picard.util import unaccent
def raw_similarity(a, b):
    """Calculates raw similarity of strings ``a`` and ``b``.

    Returns ``0.0`` when either argument is empty (or otherwise falsy).
    Otherwise returns ``SequenceMatcher(None, a, b).ratio()``, a float
    between 0.0 and 1.0 where 1.0 means the two sequences are identical.
    """
    if not a or not b:
        return 0.0
    # Single live code path: the earlier ``return boost(sim)`` made the
    # SequenceMatcher lines unreachable. ``None`` means no user-supplied
    # junk predicate is used.
    return SequenceMatcher(None, a, b).ratio()
_split_re = re.compile("\W", re.UNICODE)
_stop_words = ["the", "--", "in", "of", "a", "feat"]
@@ -73,19 +42,23 @@ _replace_words = {
"disc 8": "CD8",
}
def similarity(a1, b1):
"""Calculates "smart" similarity of strings ``a`` and ``b``."""
a2 = a1
b2 = b1
def normalize(string):
for w, r in _replace_words.items():
a2 = a2.replace(w, r)
b2 = b2.replace(w, r)
def flt(a):
def flt(a):
return a not in _stop_words and len(a) > 1
return u" ".join(filter(flt, _split_re.split(a.lower())))
a2 = flt(a2)
b2 = flt(b2)
string = string.replace(w, r)
string = string.lower()
string = " ".join(filter(lambda a: a not in _stop_words and len(a) > 1,
_split_re.split(string)))
string = unaccent(string)
return string
def similarity(a1, b1):
return raw_similarity(a1, b1)
"""Calculates "smart" similarity of strings ``a`` and ``b``."""
a2 = normalize(a1)
if a2:
b2 = normalize(b1)
else:
b2 = ""
sim1 = raw_similarity(a1, b1)
if a2 or b2:
sim2 = raw_similarity(a2, b2)

View File

@@ -96,20 +96,38 @@ class Tagger(QtGui.QApplication, ComponentManager, Component):
self.connect(self.window, QtCore.SIGNAL("addDirectory"), self.onAddDirectory)
self.connect(self.worker, QtCore.SIGNAL("statusBarMessage(const QString &)"), self.window.setStatusBarMessage)
self.connect(self.window, QtCore.SIGNAL("file_updated(int)"), QtCore.SIGNAL("file_updated(int)"))
self.worker.start()
self.browserIntegration.start()
def match_files_to_album(self, files, album):
    """Move each of ``files`` onto its best-matching track of ``album``.

    Every (file, track) pair is scored with
    ``track.metadata.compare(file.orig_metadata)`` and the pairs are visited
    from the highest score down. A pair is skipped when its score is not
    above 0.5, when the file has already been placed during this call, or
    when the track already has a linked file whose ``similarity`` is higher
    than the score. Otherwise ``file.move_to_track(track)`` is called.
    """
    candidates = sorted(
        [(track.metadata.compare(candidate.orig_metadata), candidate, track)
         for candidate in files
         for track in album.tracks],
        reverse=True)
    placed = []
    for score, candidate, track in candidates:
        if score <= 0.5 or candidate in placed:
            continue
        # Leave the track alone if the file it already holds matches better.
        holder_is_better = (track.linked_file
                            and track.linked_file.similarity > score)
        if not holder_is_better:
            candidate.move_to_track(track)
            placed.append(candidate)
def exit(self):
    """Stop the browser integration first, then the worker."""
    self.browserIntegration.stop()
    self.worker.stop()
def run(self):
    """Show the main window, run ``self.exec_()``, then shut down.

    Returns whatever ``self.exec_()`` returned; ``self.exit()`` is called
    after it returns and before the value is handed back.
    """
    self.window.show()
    exit_code = self.exec_()
    self.exit()
    return exit_code
def setup_gettext(self, localeDir):
"""Setup locales, load translations, install gettext functions."""
if sys.platform == "win32":

View File

@@ -147,7 +147,7 @@ class BaseTreeView(QtGui.QTreeWidget):
mimeData.setData("application/picard.file-list", "\n".join(file_ids))
print "\n".join(file_ids)
return mimeData
def dropFiles(self, files, target):
# File -> Track
if isinstance(target, Track):
@@ -164,8 +164,7 @@ class BaseTreeView(QtGui.QTreeWidget):
file.move_to_cluster(target.cluster)
# File -> Album
elif isinstance(target, Album):
for file in files:
target.matchFile(file)
self.tagger.match_files_to_album(files, target)
def dropAlbums(self, albums, target):
# Album -> Cluster
@@ -289,7 +288,7 @@ class FileTreeView(BaseTreeView):
file.lock_for_read()
try:
metadata = file.metadata
metadata = file.orig_metadata
item.setText(0, metadata["title"])
item.setText(1, format_time(metadata.get("~#length", 0)))
item.setText(2, metadata["artist"])

View File

@@ -410,7 +410,7 @@ class MainWindow(QtGui.QMainWindow):
if obj.linked_file:
orig_metadata = obj.linked_file.orig_metadata
metadata = obj.linked_file.metadata
statusBar = obj.linked_file.filename
statusBar = "%s (%d%%)" % (obj.linked_file.filename, obj.linked_file.similarity * 100)
file = obj.linked_file
else:
orig_metadata = obj.metadata