mirror of
https://github.com/fergalmoran/picard.git
synced 2026-01-04 15:43:58 +00:00
PICARD-2339: Ensure clustering uses most common spelling of the same artist
This restores the previous behavior: a cluster's primary artist is chosen by its tokenized artist name, and the most common actual spelling of that name is then used.
This commit is contained in:
@@ -420,21 +420,25 @@ class ClusterList(list, Item):
|
||||
class FileCluster:
    """Collect the files of one cluster and tally their artist and album names.

    Artist names are grouped by their tokenized form (the result of
    ``tokenize``), so that different spellings of the same artist are counted
    together. Each concrete spelling is tallied as well, which allows the
    most common spelling of the winning artist to be reported.
    """

    def __init__(self):
        self.files = []
        # tokenized artist name -> number of files with that artist
        self._artist_counts = defaultdict(int)
        # tokenized artist name -> {artist name as given -> number of files}
        self._artists = defaultdict(lambda: defaultdict(int))
        # album title as given -> number of files
        self._titles = defaultdict(int)

    def add(self, album, artist, file):
        """Append *file* and count it under *album* and *artist*."""
        self.files.append(file)
        # Tokenize once; the same key indexes both artist tallies.
        token = tokenize(artist)
        self._artist_counts[token] += 1
        self._artists[token][artist] += 1
        self._titles[album] += 1

    @property
    def artist(self):
        """Return the most common spelling of the most common artist.

        The artist is picked by tokenized name first; among the spellings
        recorded for that token the most frequent one is returned. Ties are
        resolved in favour of the entry that was added first.

        Raises:
            ValueError: if nothing has been added yet.
        """
        tokenized_artist = max(self._artist_counts.items(), key=itemgetter(1))[0]
        return max(self._artists[tokenized_artist].items(), key=itemgetter(1))[0]

    @property
    def title(self):
        """Return the most common album title, exactly as it was added.

        Ties are resolved in favour of the title that was added first.

        Raises:
            ValueError: if nothing has been added yet.
        """
        return max(self._titles.items(), key=itemgetter(1))[0]
|
||||
|
||||
|
||||
# Matches a single non-word character, i.e. anything other than a letter,
# a digit or the underscore; re.UNICODE selects the Unicode notion of "word".
_re_non_alphanum = re.compile(r'\W', re.UNICODE)
|
||||
|
||||
@@ -24,6 +24,7 @@ from test.picardtestcase import PicardTestCase
|
||||
|
||||
from picard.cluster import (
|
||||
Cluster,
|
||||
FileCluster,
|
||||
tokenize,
|
||||
)
|
||||
from picard.file import File
|
||||
@@ -120,14 +121,43 @@ class ClusterTest(PicardTestCase):
|
||||
clusters = list(Cluster.cluster(files))
|
||||
self.assertEqual(0, len(clusters))
|
||||
|
||||
# def test_common_artist_name(self):
|
||||
# files = [
|
||||
# self._create_file('cluster 1', 'artist 1'),
|
||||
# self._create_file('cluster 1', 'artist 2'),
|
||||
# self._create_file('cluster 1', 'artist2'),
|
||||
# self._create_file('cluster 1', 'artist 1'),
|
||||
# self._create_file('cluster 1', 'artist 2'),
|
||||
# ]
|
||||
# clusters = list(Cluster.cluster(files))
|
||||
# self.assertEqual(1, len(clusters))
|
||||
# self.assertClusterEqual('cluster 1', 'artist 2', files, clusters[0])
|
||||
def test_common_artist_name(self):
    # Five files on album 'cluster 1' with three artist spellings must end
    # up in a single cluster whose artist is reported as 'artist 2'.
    artist_names = (
        'artist 1',
        'artist 2',
        'artist2',
        'artist 1',
        'artist 2',
    )
    files = [self._create_file('cluster 1', name) for name in artist_names]
    clusters = list(Cluster.cluster(files))
    self.assertEqual(1, len(clusters))
    self.assertClusterEqual('cluster 1', 'artist 2', files, clusters[0])
|
||||
|
||||
|
||||
class FileClusterTest(PicardTestCase):
    # Exercises FileCluster directly through add(), title, artist and files.

    def test_single(self):
        # With one file added, its album and artist are reported unchanged.
        only_file = File('foo')
        cluster = FileCluster()
        cluster.add('album 1', 'artist 1', only_file)
        self.assertEqual('album 1', cluster.title)
        self.assertEqual('artist 1', cluster.artist)
        self.assertEqual([only_file], cluster.files)

    def test_multi(self):
        # (album, artist, file) triples, in the order they are added.
        entries = (
            ('album 1', 'artist1', File('foo1')),
            ('Album 1', 'artist 2', File('foo2')),
            ('album\t1', 'Artist 1', File('foo3')),
            ('Album 1', 'Artist 2', File('foo4')),
            ('album 2', 'Artist 1', File('foo5')),
        )
        cluster = FileCluster()
        for album, artist, track_file in entries:
            cluster.add(album, artist, track_file)
        # 'Album 1' is the only title that occurs twice.
        self.assertEqual('Album 1', cluster.title)
        # NOTE(review): presumably 'artist1' and 'Artist 1' share one
        # tokenized form, which would make that artist the most common one;
        # tokenize() is not visible here - confirm.
        self.assertEqual('Artist 1', cluster.artist)
        self.assertEqual([entry[2] for entry in entries], cluster.files)
|
||||
|
||||
Reference in New Issue
Block a user