Spaces:
Runtime error
Runtime error
| import os | |
| import sqlite3 | |
| import pandas as pd | |
| from nltk.tokenize import word_tokenize | |
| import re | |
| # Function to chunk text into specified size with overlap and keep track of timestamps | |
| def chunk_text_with_timestamps(text, start_ts, end_ts, chunk_size=256, overlap=0.5): | |
| words = word_tokenize(text) | |
| chunks = [] | |
| step = int(chunk_size * (1 - overlap)) | |
| num_chunks = (len(words) - chunk_size + step) // step | |
| for i in range(0, num_chunks * step, step): | |
| chunk = words[i:i + chunk_size] | |
| if len(chunk) < chunk_size: | |
| break | |
| chunk_start_ts = start_ts | |
| chunk_end_ts = end_ts # Placeholder for real calculation, you might need to calculate it based on word timings | |
| chunks.append((' '.join(chunk), chunk_start_ts, chunk_end_ts)) | |
| return chunks | |
| # Function to read VTT files from the database | |
| def read_vtt_files_from_db(db_path): | |
| conn = sqlite3.connect(db_path) | |
| cursor = conn.cursor() | |
| cursor.execute("SELECT folder_name, file_name FROM vtt_files") | |
| vtt_files = cursor.fetchall() | |
| conn.close() | |
| return vtt_files | |
| # Function to process VTT file and extract chunks with timestamps | |
| def process_vtt_file(file_path, chunk_size=256, overlap=0.5): | |
| with open(file_path, 'r') as file: | |
| vtt_data = file.read() | |
| # Regular expression to match the VTT format | |
| pattern = r"(\d+)\n(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\n(.*?): (.*?)\n" | |
| matches = re.findall(pattern, vtt_data, re.DOTALL) | |
| # Merge text by the same user and create chunks with timestamps | |
| data = [] | |
| current_user = None | |
| current_text = "" | |
| start_ts = None | |
| for match in matches: | |
| id, start, end, user, text = match | |
| if user != current_user: | |
| if current_user is not None: | |
| data.append((current_user, current_text, start_ts, previous_end)) | |
| current_user = user | |
| current_text = text | |
| start_ts = start | |
| else: | |
| current_text += " " + text | |
| previous_end = end | |
| if current_user is not None: | |
| data.append((current_user, current_text, start_ts, previous_end)) | |
| chunks = [] | |
| chunk_id = 1 | |
| for user, text, start_ts, end_ts in data: | |
| text_chunks = chunk_text_with_timestamps(text, start_ts, end_ts, chunk_size, overlap) | |
| for chunk, chunk_start_ts, chunk_end_ts in text_chunks: | |
| chunks.append((chunk_id, chunk, chunk_start_ts, chunk_end_ts, user)) | |
| chunk_id += 1 | |
| return chunks | |
| # Function to save chunks to the database | |
| def save_chunks_to_db(db_path, folder_name, file_name, chunks): | |
| conn = sqlite3.connect(db_path) | |
| cursor = conn.cursor() | |
| cursor.execute(''' | |
| CREATE TABLE IF NOT EXISTS text_chunks ( | |
| id INTEGER PRIMARY KEY, | |
| talkname TEXT, | |
| filename TEXT, | |
| chunkid INTEGER, | |
| chunk TEXT, | |
| start_ts TEXT, | |
| end_ts TEXT, | |
| username TEXT | |
| ) | |
| ''') | |
| for chunk_id, chunk, chunk_start_ts, chunk_end_ts, user in chunks: | |
| cursor.execute(''' | |
| INSERT INTO text_chunks (talkname, filename, chunkid, chunk, start_ts, end_ts, username) | |
| VALUES (?, ?, ?, ?, ?, ?, ?) | |
| ''', (folder_name, file_name, chunk_id, chunk, chunk_start_ts, chunk_end_ts, user)) | |
| conn.commit() | |
| conn.close() | |
| # Main script to process all VTT files and save chunks | |
| root_dir = '/Users/t0mkaka/Desktop/Network/vtt_files' # Replace with the path to your root folder | |
| db_path = 'rag.db' # Replace with the path to your SQLite database | |
| vtt_files = read_vtt_files_from_db(db_path) | |
| for folder_name, file_name in vtt_files: | |
| file_path = os.path.join(root_dir, file_name) | |
| if os.path.exists(file_path): | |
| chunks = process_vtt_file(file_path) | |
| save_chunks_to_db(db_path, folder_name, file_name, chunks) | |
| print("Processed and saved all text chunks with timestamps and usernames to the database.") | |