from sklearn.feature_extraction.text import TfidfVectorizer

def find_matches(chunks, keywords, padding=500):
    """Rank text chunks by TF-IDF cosine similarity to a set of keywords.

    Note: `padding` is currently unused; it is kept only so existing callers
    that pass it do not break.
    """
    # Normalize whitespace within each chunk
    preprocessed_chunks = [' '.join(chunk.split()) for chunk in chunks]

    # TF-IDF vectorization of the chunks
    vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(preprocessed_chunks)

    # Project the keyword list into the same TF-IDF space
    keyword_vector = vectorizer.transform([' '.join(keywords)])

    # TfidfVectorizer L2-normalizes rows by default, so the dot product of the
    # chunk matrix with the keyword vector is the cosine similarity
    cosine_similarities = tfidf_matrix.dot(keyword_vector.T).toarray().flatten()

    # Rank chunk indices from most to least similar to the keywords
    results = {i: score for i, score in enumerate(cosine_similarities)}
    return dict(sorted(results.items(), key=lambda item: item[1], reverse=True))
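

# Minimal usage sketch (illustrative only; the sample chunks and keywords below
# are assumptions, not part of the original file). It ranks a few short text
# chunks against a keyword list and prints the chunk indices with their scores.
if __name__ == "__main__":
    sample_chunks = [
        "TF-IDF weighs terms by how rare they are across documents.",
        "Cosine similarity measures the angle between two vectors.",
        "This sentence is about something else entirely.",
    ]
    sample_keywords = ["cosine", "similarity", "vectors"]

    ranking = find_matches(sample_chunks, sample_keywords)
    for index, score in ranking.items():
        print(f"chunk {index}: similarity {score:.3f}")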