RizzCon-Answering-Machine / vtt_process.py
t0mkaka's picture
First version
b9edf3f
import os
import sqlite3
import pandas as pd
from nltk.tokenize import word_tokenize
import re
# Function to chunk text into specified size with overlap and keep track of timestamps
def chunk_text_with_timestamps(text, start_ts, end_ts, chunk_size=256, overlap=0.5):
words = word_tokenize(text)
chunks = []
step = int(chunk_size * (1 - overlap))
num_chunks = (len(words) - chunk_size + step) // step
for i in range(0, num_chunks * step, step):
chunk = words[i:i + chunk_size]
if len(chunk) < chunk_size:
break
chunk_start_ts = start_ts
chunk_end_ts = end_ts # Placeholder for real calculation, you might need to calculate it based on word timings
chunks.append((' '.join(chunk), chunk_start_ts, chunk_end_ts))
return chunks
# Function to read VTT files from the database
def read_vtt_files_from_db(db_path):
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute("SELECT folder_name, file_name FROM vtt_files")
vtt_files = cursor.fetchall()
conn.close()
return vtt_files
# Function to process VTT file and extract chunks with timestamps
def process_vtt_file(file_path, chunk_size=256, overlap=0.5):
with open(file_path, 'r') as file:
vtt_data = file.read()
# Regular expression to match the VTT format
pattern = r"(\d+)\n(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\n(.*?): (.*?)\n"
matches = re.findall(pattern, vtt_data, re.DOTALL)
# Merge text by the same user and create chunks with timestamps
data = []
current_user = None
current_text = ""
start_ts = None
for match in matches:
id, start, end, user, text = match
if user != current_user:
if current_user is not None:
data.append((current_user, current_text, start_ts, previous_end))
current_user = user
current_text = text
start_ts = start
else:
current_text += " " + text
previous_end = end
if current_user is not None:
data.append((current_user, current_text, start_ts, previous_end))
chunks = []
chunk_id = 1
for user, text, start_ts, end_ts in data:
text_chunks = chunk_text_with_timestamps(text, start_ts, end_ts, chunk_size, overlap)
for chunk, chunk_start_ts, chunk_end_ts in text_chunks:
chunks.append((chunk_id, chunk, chunk_start_ts, chunk_end_ts, user))
chunk_id += 1
return chunks
# Function to save chunks to the database
def save_chunks_to_db(db_path, folder_name, file_name, chunks):
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS text_chunks (
id INTEGER PRIMARY KEY,
talkname TEXT,
filename TEXT,
chunkid INTEGER,
chunk TEXT,
start_ts TEXT,
end_ts TEXT,
username TEXT
)
''')
for chunk_id, chunk, chunk_start_ts, chunk_end_ts, user in chunks:
cursor.execute('''
INSERT INTO text_chunks (talkname, filename, chunkid, chunk, start_ts, end_ts, username)
VALUES (?, ?, ?, ?, ?, ?, ?)
''', (folder_name, file_name, chunk_id, chunk, chunk_start_ts, chunk_end_ts, user))
conn.commit()
conn.close()
# Main script to process all VTT files and save chunks
root_dir = '/Users/t0mkaka/Desktop/Network/vtt_files' # Replace with the path to your root folder
db_path = 'rag.db' # Replace with the path to your SQLite database
vtt_files = read_vtt_files_from_db(db_path)
for folder_name, file_name in vtt_files:
file_path = os.path.join(root_dir, file_name)
if os.path.exists(file_path):
chunks = process_vtt_file(file_path)
save_chunks_to_db(db_path, folder_name, file_name, chunks)
print("Processed and saved all text chunks with timestamps and usernames to the database.")