from sklearn.feature_extraction.text import TfidfVectorizer


def find_matches(chunks, keywords, padding=500):
    """Rank text chunks by TF-IDF similarity to a set of keywords.

    Each chunk is whitespace-normalized, the chunks are vectorized with
    TF-IDF (lowercased, English stop words removed), and the keywords are
    projected into the same vector space as one combined query. The score
    of a chunk is the dot product of its TF-IDF row with the query vector;
    with the vectorizer's default L2 normalization this is the cosine
    similarity.

    Args:
        chunks: Iterable of text strings to rank.
        keywords: Iterable of keyword strings. A single string is treated
            as one query rather than being split into characters.
        padding: Currently unused; retained so existing callers that pass
            it keep working.

    Returns:
        A dict mapping each chunk's index in ``chunks`` to its score,
        inserted in descending score order. Chunks with equal scores keep
        their original relative order. An empty ``chunks`` yields ``{}``;
        if no chunk contains an indexable term, every chunk scores 0.0.

    Raises:
        ValueError: Propagated from the vectorizer for any failure other
            than an empty vocabulary.
    """
    # Collapse runs of whitespace (newlines, tabs, repeated spaces) so the
    # vectorizer sees clean single-space-separated text. Materializing the
    # list here also lets ``chunks`` be a one-shot iterable.
    normalized_chunks = [" ".join(chunk.split()) for chunk in chunks]
    if not normalized_chunks:
        # Nothing to rank; fitting a vectorizer on no documents would raise.
        return {}

    # ``' '.join("abc")`` would yield "a b c", whose one-letter tokens the
    # default tokenizer discards, silently zeroing every score.
    if isinstance(keywords, str):
        keywords = [keywords]
    query = " ".join(keywords)

    vectorizer = TfidfVectorizer(lowercase=True, stop_words="english")
    try:
        tfidf_matrix = vectorizer.fit_transform(normalized_chunks)
    except ValueError as err:
        # scikit-learn raises "empty vocabulary" when every chunk is blank
        # or made up solely of stop words. No chunk can match anything in
        # that case, so report zero similarity instead of crashing.
        if "empty vocabulary" not in str(err):
            raise
        return {index: 0.0 for index in range(len(normalized_chunks))}

    # Project the query into the vocabulary learned from the chunks; terms
    # absent from that vocabulary contribute nothing to the score.
    query_vector = vectorizer.transform([query])
    scores = tfidf_matrix.dot(query_vector.T).toarray().ravel()

    # ``sorted`` is stable even with ``reverse=True``, so tied chunks stay
    # in their original index order.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)