Spaces:
Runtime error
Runtime error
| import os | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.cluster import AgglomerativeClustering | |
| from scipy.spatial.distance import cosine | |
| from nltk.tokenize import sent_tokenize | |
| import nltk | |
# Download necessary NLTK resources
# 'punkt' is the sentence-tokenizer data used by older NLTK releases; newer
# releases load the tokenizer from 'punkt_tab' instead, so fetch both to keep
# sent_tokenize working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
# Function to chunk text based on semantic similarity
def semantic_chunking(text, model, threshold=0.5):
    """Group the sentences of ``text`` into semantically similar chunks.

    The text is split into sentences, each sentence is embedded with
    ``model.encode``, and the sentences are clustered with average-linkage
    agglomerative clustering over their pairwise cosine distances.

    Args:
        text: The document to split.
        model: Object exposing ``encode(list_of_sentences)`` that returns one
            embedding vector per sentence (presumably a SentenceTransformer;
            only ``encode`` is used here).
        threshold: Cosine-distance cut-off passed to the clustering as
            ``distance_threshold``; clusters further apart than this are not
            merged.

    Returns:
        A list of strings, one per cluster, each made of the cluster's
        sentences joined with a single space. Sentences keep their original
        relative order inside a chunk; chunks are ordered by cluster label.
        Empty or whitespace-only text yields ``[]`` and a single-sentence
        text yields a one-element list.
    """
    # Nothing to tokenize or embed for blank input.
    if not text or not text.strip():
        return []

    sentences = sent_tokenize(text)
    if not sentences:
        return []
    # Clustering needs at least two samples; one sentence is its own chunk.
    if len(sentences) == 1:
        return [sentences[0]]

    embeddings = np.asarray(model.encode(sentences), dtype=np.float64)

    # Pairwise cosine distance in one vectorised step: normalise each row to
    # unit length, then distance = 1 - dot product.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # Avoid division by zero; an all-zero embedding ends up at distance 1
    # from every other sentence.
    norms[norms == 0] = 1.0
    unit_vectors = embeddings / norms
    distances = 1.0 - unit_vectors @ unit_vectors.T
    # Remove floating-point noise: keep values in the valid range and make
    # each sentence exactly distance 0 from itself.
    np.clip(distances, 0.0, 2.0, out=distances)
    np.fill_diagonal(distances, 0.0)

    clustering = AgglomerativeClustering(
        n_clusters=None,
        metric='precomputed',
        linkage='average',
        distance_threshold=threshold,
    )
    clustering.fit(distances)

    chunks = [[] for _ in range(clustering.n_clusters_)]
    for sentence, label in zip(sentences, clustering.labels_):
        chunks[label].append(sentence)
    return [' '.join(chunk) for chunk in chunks]
# Load the shared sentence-embedding model once at import time; it is reused
# for every document that gets chunked.
model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-MiniLM-L6-v2',
)
# Function to process all files in a directory and save them to a new directory
def process_directory(input_dir, output_dir):
    """Chunk every ``.txt`` file in ``input_dir`` and write it to ``output_dir``.

    Each input file is read as UTF-8 (undecodable bytes are ignored), split
    with ``semantic_chunking`` using the module-level ``model``, and written
    to a file of the same name in ``output_dir`` with chunks separated by a
    blank line. Files that cannot be chunked or written are reported on
    stdout and skipped so one bad document does not stop the batch.

    Args:
        input_dir: Directory containing the source ``.txt`` files.
        output_dir: Directory for the chunked files; created if missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so files are processed (and failures reported) in a stable order.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith('.txt'):
            continue

        input_file_path = os.path.join(input_dir, filename)
        # A directory whose name ends in .txt cannot be read as a document.
        if not os.path.isfile(input_file_path):
            continue
        output_file_path = os.path.join(output_dir, filename)

        with open(input_file_path, 'r', encoding='utf-8', errors='ignore') as file:
            text = file.read()

        try:
            chunks = semantic_chunking(text, model)
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                for chunk in chunks:
                    output_file.write(chunk + '\n\n')
        except Exception as exc:
            # Best-effort batch: report which file failed and why, then move
            # on. KeyboardInterrupt/SystemExit are deliberately not caught.
            print(f'notchunkable: {filename} ({exc})')
# Source folder holding the raw .txt documents, and the destination folder
# that receives the chunked versions.
input_dir, output_dir = 'docs_dump', 'semchunksBIG'

# Run the chunking pass over every document in the source folder.
process_directory(input_dir=input_dir, output_dir=output_dir)