MarieGotthardt committed on
Commit
a4dcd8d
1 Parent(s): b4666e4

Upload song_guesser.py

Browse files
Files changed (1) hide show
  1. song_guesser.py +36 -0
song_guesser.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from datasketch import MinHash
3
+
4
class SongGuesser:
    """Guess which song a text snippet belongs to via MinHash similarity."""

    @staticmethod
    def guess_song(query, songs_path="./swedish_christmas_songs.json", shingle_size=5):
        """Return the name of the song whose lyrics best match ``query``.

        The songs file is loaded as JSON and iterated; every entry must
        provide a ``'lyrics'`` string and a ``'name'`` value. Query and
        lyrics are both lower-cased, split into character shingles of
        ``shingle_size`` and compared through their MinHash signatures.

        Args:
            query: Text snippet to look up.
            songs_path: Path to the JSON song collection. Defaults to the
                file in the current working directory.
            shingle_size: Number of characters per shingle.

        Returns:
            The ``'name'`` of the entry with the highest estimated Jaccard
            similarity (the first one wins on ties), or ``""`` when no entry
            has a similarity above zero or the query is too short to
            produce any shingle.

        Raises:
            OSError: If the songs file cannot be opened.
            json.JSONDecodeError: If the songs file is not valid JSON.
        """
        with open(songs_path, "r", encoding='utf-8') as f:
            songs = json.load(f)

        # Lyrics are lower-cased below, so the query has to be normalised the
        # same way; otherwise shingles containing capitals can never match.
        query_shingles = get_shingles(query.lower(), shingle_size)
        if not query_shingles:
            # An empty signature carries no information and would compare as
            # identical to any other empty signature (e.g. very short lyrics).
            return ""
        query_minhash = create_minhash(query_shingles)

        max_sim = 0
        max_name = ""

        for song in songs:
            song_lyrics = song['lyrics'].lower()
            song_shingles = get_shingles(song_lyrics, shingle_size)
            song_minhash = create_minhash(song_shingles)

            estimated_jaccard = query_minhash.jaccard(song_minhash)
            # Strict comparison keeps the earliest best match on ties.
            if estimated_jaccard > max_sim:
                max_sim = estimated_jaccard
                max_name = song['name']

        return max_name
28
+
29
def get_shingles(text, shingle_size):
    """Return the set of all contiguous substrings of ``text`` of a given length.

    Args:
        text: String to split into overlapping character shingles.
        shingle_size: Length of each shingle in characters.

    Returns:
        A set with every distinct window of ``shingle_size`` characters;
        empty when ``text`` is shorter than ``shingle_size``.
    """
    # Number of window start positions; zero or negative yields an empty range.
    window_count = len(text) - shingle_size + 1
    return {text[start:start + shingle_size] for start in range(window_count)}
31
+
32
def create_minhash(shingles, num_perm=128):
    """Build a MinHash signature from an iterable of string shingles.

    Args:
        shingles: Iterable of strings; each one is UTF-8 encoded and fed
            into the signature.
        num_perm: Number of permutations passed to ``MinHash``.

    Returns:
        The ``MinHash`` object updated with every shingle.
    """
    signature = MinHash(num_perm=num_perm)
    for token in shingles:
        # MinHash.update takes bytes, so encode each shingle first.
        encoded = token.encode('utf8')
        signature.update(encoded)
    return signature