# NOTE(review): the following three lines ("Spaces:", "Sleeping", "Sleeping")
# appear to be pasted UI/status text from a copy-extraction artifact, not
# Python — commented out so the module can be imported. Confirm and delete.
# Spaces:
# Sleeping
# Sleeping
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from text_processing import extract_text_from_pdf, clean_text
def chunk_text(text, chunk_size=1000, chunk_overlap=150):
    """
    Split text into overlapping chunks.

    Args:
        text (str): Input text.
        chunk_size (int): Maximum size of each chunk, in characters.
        chunk_overlap (int): Number of characters shared between
            consecutive chunks.

    Returns:
        list: List of text chunks, each at most ``chunk_size`` characters.
    """
    # The final "" fallback separator matters: without it, any run of text
    # longer than chunk_size that contains no blank line, newline, or space
    # cannot be split further and is emitted as a single oversized chunk.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
    )
    return text_splitter.split_text(text)
# Cache of loaded SentenceTransformer models keyed by model name, so repeated
# embedding calls do not reload the model weights from disk every time.
_EMBEDDING_MODEL_CACHE = {}


def _get_embedding_model(model_name):
    """Return a cached SentenceTransformer for *model_name*, loading it once."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        _EMBEDDING_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODEL_CACHE[model_name]


def generate_embeddings(chunks, model_name='all-MiniLM-L6-v2'):
    """
    Generate embeddings for text chunks.

    Args:
        chunks (list): List of text chunks.
        model_name (str): SentenceTransformer model name.

    Returns:
        np.ndarray: Array of embeddings, one row per input chunk.
    """
    model = _get_embedding_model(model_name)
    return model.encode(chunks, convert_to_numpy=True)
def process_pdf_for_rag(pdf_path, chunk_size=500):
    """
    Run the extract -> clean -> chunk pipeline over a PDF for RAG.

    Progress is reported on stdout at each stage.

    Args:
        pdf_path (str): Path to the PDF file.
        chunk_size (int): Size of each chunk.

    Returns:
        list: List of text chunks.
    """
    print("Extracting text from PDF...")
    extracted = extract_text_from_pdf(pdf_path)

    print("Cleaning text...")
    cleaned = clean_text(extracted)

    print("Chunking text...")
    pieces = chunk_text(cleaned, chunk_size)

    print("Processing complete!")
    return pieces