PICARD-2339: Ensure clustering uses most common spelling of the same artist

This restores the previous behavior: a cluster's primary artist is determined by the most common tokenized artist name, and the most common actual spelling of that name is then used.
2026-01-04 15:43:58 +00:00 · 2021-11-23 09:48:31 +01:00
parent bb48705357
commit 2aefcd051a
2 changed files with 51 additions and 17 deletions
--- a/picard/cluster.py
+++ b/picard/cluster.py
@@ -420,21 +420,25 @@ class ClusterList(list, Item):
 class FileCluster:
    def __init__(self):
        self.files = []
-        self.artists = defaultdict(lambda: 0)
-        self.titles = defaultdict(lambda: 0)
+        self._artist_counts = defaultdict(lambda: 0)
+        self._artists = defaultdict(lambda: defaultdict(lambda: 0))
+        self._titles = defaultdict(lambda: 0)

    def add(self, album, artist, file):
        self.files.append(file)
-        self.artists[artist] += 1
-        self.titles[album] += 1
+        self._artist_counts[tokenize(artist)] += 1
+        self._artists[tokenize(artist)][artist] += 1
+        self._titles[album] += 1

    @property
    def artist(self):
-        return max(self.artists.items(), key=itemgetter(1))[0]
+        tokenized_artist = max(self._artist_counts.items(), key=itemgetter(1))[0]
+        return max(self._artists[tokenized_artist].items(), key=itemgetter(1))[0]

    @property
    def title(self):
-        return max(self.titles.items(), key=itemgetter(1))[0]
+        # Find the most common title
+        return max(self._titles.items(), key=itemgetter(1))[0]


 _re_non_alphanum = re.compile(r'\W', re.UNICODE)
--- a/test/test_clustering.py
+++ b/test/test_clustering.py
@@ -24,6 +24,7 @@ from test.picardtestcase import PicardTestCase

 from picard.cluster import (
    Cluster,
+    FileCluster,
    tokenize,
 )
 from picard.file import File
@@ -120,14 +121,43 @@ class ClusterTest(PicardTestCase):
        clusters = list(Cluster.cluster(files))
        self.assertEqual(0, len(clusters))

-    # def test_common_artist_name(self):
-    #     files = [
-    #         self._create_file('cluster 1', 'artist 1'),
-    #         self._create_file('cluster 1', 'artist 2'),
-    #         self._create_file('cluster 1', 'artist2'),
-    #         self._create_file('cluster 1', 'artist 1'),
-    #         self._create_file('cluster 1', 'artist 2'),
-    #     ]
-    #     clusters = list(Cluster.cluster(files))
-    #     self.assertEqual(1, len(clusters))
-    #     self.assertClusterEqual('cluster 1', 'artist 2', files, clusters[0])
+    def test_common_artist_name(self):
+        files = [
+            self._create_file('cluster 1', 'artist 1'),
+            self._create_file('cluster 1', 'artist 2'),
+            self._create_file('cluster 1', 'artist2'),
+            self._create_file('cluster 1', 'artist 1'),
+            self._create_file('cluster 1', 'artist 2'),
+        ]
+        clusters = list(Cluster.cluster(files))
+        self.assertEqual(1, len(clusters))
+        self.assertClusterEqual('cluster 1', 'artist 2', files, clusters[0])
+
+
+class FileClusterTest(PicardTestCase):
+
+    def test_single(self):
+        file = File('foo')
+        fc = FileCluster()
+        fc.add('album 1', 'artist 1', file)
+        self.assertEqual('album 1', fc.title)
+        self.assertEqual('artist 1', fc.artist)
+        self.assertEqual([file], fc.files)
+
+    def test_multi(self):
+        files = [
+            File('foo1'),
+            File('foo2'),
+            File('foo3'),
+            File('foo4'),
+            File('foo5'),
+        ]
+        fc = FileCluster()
+        fc.add('album 1', 'artist1', files[0])
+        fc.add('Album 1', 'artist 2', files[1])
+        fc.add('album\t1', 'Artist 1', files[2])
+        fc.add('Album 1', 'Artist 2', files[3])
+        fc.add('album 2', 'Artist 1', files[4])
+        self.assertEqual('Album 1', fc.title)
+        self.assertEqual('Artist 1', fc.artist)
+        self.assertEqual(files, fc.files)