PICARD-2339: Ensure clustering uses most common spelling of the same artist

This restores the previous behavior: a cluster's primary artist is determined by the tokenized artist name, and the most common actual spelling of that name is then used.
This commit is contained in:
Philipp Wolfer
2021-11-23 09:48:31 +01:00
parent bb48705357
commit 2aefcd051a
2 changed files with 51 additions and 17 deletions

View File

@@ -420,21 +420,25 @@ class ClusterList(list, Item):
class FileCluster:
    """Collect the files of one prospective cluster and derive its name.

    Artist names are grouped by the value ``tokenize`` returns for them, so
    spellings that tokenize identically are counted as the same artist. The
    reported artist is the most frequent spelling, exactly as it was passed
    in, within the most frequent tokenized group. Titles are counted by
    their exact spelling.
    """

    def __init__(self):
        self.files = []
        # tokenized artist -> number of files added with that artist
        self._artist_counts = defaultdict(int)
        # tokenized artist -> {artist spelling as given -> number of files}
        self._artists = defaultdict(lambda: defaultdict(int))
        # album title as given -> number of files
        self._titles = defaultdict(int)

    def add(self, album, artist, file):
        """Register ``file`` together with its album title and artist name."""
        self.files.append(file)
        # Tokenize once; both artist tables are keyed by the same token.
        token = tokenize(artist)
        self._artist_counts[token] += 1
        self._artists[token][artist] += 1
        self._titles[album] += 1

    @property
    def artist(self):
        """Most common spelling of the most common (tokenized) artist.

        Ties go to the entry that was added first, because ``max`` returns
        the first maximal item and dicts keep insertion order.

        Raises:
            ValueError: if no file has been added yet.
        """
        # First pick the dominant artist regardless of spelling differences ...
        tokenized_artist = max(self._artist_counts.items(), key=itemgetter(1))[0]
        # ... then the spelling of that artist which occurs most often.
        return max(self._artists[tokenized_artist].items(), key=itemgetter(1))[0]

    @property
    def title(self):
        """Most common album title among the added files.

        Ties go to the title that was added first.

        Raises:
            ValueError: if no file has been added yet.
        """
        return max(self._titles.items(), key=itemgetter(1))[0]
# Matches a single non-word character (\W), i.e. anything that is not a
# letter, digit or underscore under Unicode matching rules.
_re_non_alphanum = re.compile(r'\W', re.UNICODE)

View File

@@ -24,6 +24,7 @@ from test.picardtestcase import PicardTestCase
from picard.cluster import (
Cluster,
FileCluster,
tokenize,
)
from picard.file import File
@@ -120,14 +121,43 @@ class ClusterTest(PicardTestCase):
clusters = list(Cluster.cluster(files))
self.assertEqual(0, len(clusters))
# def test_common_artist_name(self):
# files = [
# self._create_file('cluster 1', 'artist 1'),
# self._create_file('cluster 1', 'artist 2'),
# self._create_file('cluster 1', 'artist2'),
# self._create_file('cluster 1', 'artist 1'),
# self._create_file('cluster 1', 'artist 2'),
# ]
# clusters = list(Cluster.cluster(files))
# self.assertEqual(1, len(clusters))
# self.assertClusterEqual('cluster 1', 'artist 2', files, clusters[0])
def test_common_artist_name(self):
    # Five files on the same album. The test expects them to form a single
    # cluster named after 'artist 2': 'artist 2' and 'artist2' together
    # appear three times, 'artist 1' only twice.
    artist_names = ('artist 1', 'artist 2', 'artist2', 'artist 1', 'artist 2')
    files = [self._create_file('cluster 1', name) for name in artist_names]
    clusters = list(Cluster.cluster(files))
    self.assertEqual(1, len(clusters))
    self.assertClusterEqual('cluster 1', 'artist 2', files, clusters[0])
class FileClusterTest(PicardTestCase):
    # Exercises FileCluster's title / artist / files bookkeeping directly.

    def test_single(self):
        # With one file, the cluster reports exactly what was added.
        only_file = File('foo')
        cluster = FileCluster()
        cluster.add('album 1', 'artist 1', only_file)
        self.assertEqual('album 1', cluster.title)
        self.assertEqual('artist 1', cluster.artist)
        self.assertEqual([only_file], cluster.files)

    def test_multi(self):
        # Several spellings of the same album and artists; the expected
        # results are the most frequent title and artist spelling.
        files = [File(name) for name in ('foo1', 'foo2', 'foo3', 'foo4', 'foo5')]
        tags = (
            ('album 1', 'artist1'),
            ('Album 1', 'artist 2'),
            ('album\t1', 'Artist 1'),
            ('Album 1', 'Artist 2'),
            ('album 2', 'Artist 1'),
        )
        cluster = FileCluster()
        for (album, artist), member in zip(tags, files):
            cluster.add(album, artist, member)
        self.assertEqual('Album 1', cluster.title)
        self.assertEqual('Artist 1', cluster.artist)
        self.assertEqual(files, cluster.files)