PICARD-2339: Simplify clustering algorithm

The existing code was using the Levenshtein distance to calculate similarity, which caused a O(n^2) performance. But since only exactly similar matches where used (similarity threshold 1.0) this was not necessary.

This new implementation uses simple comparison for string equality and performs in O(n).
This commit is contained in:
Philipp Wolfer
2021-11-23 08:13:53 +01:00
parent 0ff35391be
commit bb48705357
3 changed files with 77 additions and 259 deletions

View File

@@ -24,25 +24,21 @@ from test.picardtestcase import PicardTestCase
from picard.cluster import (
Cluster,
ClusterDict,
tokenize,
)
from picard.file import File
class TokenizeTest(PicardTestCase):
def setUp(self):
super().setUp()
self.clusterdict = ClusterDict()
def test_tokenize(self):
token = self.clusterdict.tokenize("")
token = tokenize("")
self.assertEqual(token, "")
token = self.clusterdict.tokenize(" \t ")
token = tokenize(" \t ")
self.assertEqual(token, "")
token = self.clusterdict.tokenize(" A\tWord-test ")
token = tokenize(" A\tWord-test ")
self.assertEqual(token, "awordtest")
@@ -61,12 +57,12 @@ class ClusterTest(PicardTestCase):
return file
def assertClusterEqual(self, album, artist, files, cluster):
self.assertEqual(album, cluster[0])
self.assertEqual(artist, cluster[1])
self.assertEqual(set(files), set(cluster[2]))
self.assertEqual(album, cluster.title)
self.assertEqual(artist, cluster.artist)
self.assertEqual(set(files), set(cluster.files))
def test_cluster_none(self):
clusters = list(Cluster.cluster([], 1.0))
clusters = list(Cluster.cluster([]))
# No cluster is being created
self.assertEqual(0, len(clusters))
@@ -74,7 +70,7 @@ class ClusterTest(PicardTestCase):
files = [
self._create_file('album foo', 'artist foo'),
]
clusters = list(Cluster.cluster(files, 1.0))
clusters = list(Cluster.cluster(files))
# No cluster is being created for single files
self.assertEqual(0, len(clusters))
@@ -84,19 +80,19 @@ class ClusterTest(PicardTestCase):
self._create_file('album foo', 'artist foo'),
self._create_file('album foo', 'artist foo'),
]
clusters = list(Cluster.cluster(files, 1.0))
clusters = list(Cluster.cluster(files))
self.assertEqual(1, len(clusters))
self.assertClusterEqual('album foo', 'artist foo', files, clusters[0])
def test_cluster_multi(self):
files = [
self._create_file('album cluster1', 'artist bar'),
self._create_file('albumcluster2', 'artist foo'),
self._create_file('album cluster2', 'artist foo'),
self._create_file('album cluster1', 'artist foo'),
self._create_file('album cluster2', 'artist bar'),
self._create_file('albumcluster2', 'artist bar'),
self._create_file('album nocluster', 'artist bar'),
]
clusters = list(Cluster.cluster(files, 1.0))
clusters = list(Cluster.cluster(files))
self.assertEqual(2, len(clusters))
self.assertClusterEqual('album cluster1', 'artist bar', {files[0], files[2]}, clusters[0])
self.assertClusterEqual('album cluster2', 'artist foo', {files[1], files[3]}, clusters[1])
@@ -110,7 +106,7 @@ class ClusterTest(PicardTestCase):
self._create_file(None, None, 'nocluster/foo.ogg'),
self._create_file(None, None, 'album1/foo3.ogg'),
]
clusters = list(Cluster.cluster(files, 1.0))
clusters = list(Cluster.cluster(files))
self.assertEqual(2, len(clusters))
self.assertClusterEqual('album1', 'artist1', {files[0], files[2], files[5]}, clusters[0])
self.assertClusterEqual('album2', 'Various Artists', {files[1], files[3]}, clusters[1])
@@ -121,17 +117,17 @@ class ClusterTest(PicardTestCase):
self._create_file(None, None, 'foo2.ogg'),
self._create_file(None, None, 'foo3.ogg'),
]
clusters = list(Cluster.cluster(files, 1.0))
clusters = list(Cluster.cluster(files))
self.assertEqual(0, len(clusters))
def test_common_artist_name(self):
files = [
self._create_file('cluster1', 'artist1'),
self._create_file('cluster1', 'artist2'),
self._create_file('cluster1', 'artist 2'),
self._create_file('cluster1', 'artist1'),
self._create_file('cluster1', 'artist2'),
]
clusters = list(Cluster.cluster(files, 1.0))
self.assertEqual(1, len(clusters))
self.assertClusterEqual('cluster1', 'artist2', files, clusters[0])
# def test_common_artist_name(self):
# files = [
# self._create_file('cluster 1', 'artist 1'),
# self._create_file('cluster 1', 'artist 2'),
# self._create_file('cluster 1', 'artist2'),
# self._create_file('cluster 1', 'artist 1'),
# self._create_file('cluster 1', 'artist 2'),
# ]
# clusters = list(Cluster.cluster(files))
# self.assertEqual(1, len(clusters))
# self.assertClusterEqual('cluster 1', 'artist 2', files, clusters[0])