Spaces:
Sleeping
Sleeping
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from sentence_transformers import SentenceTransformer | |
| from text_processing import extract_text_from_pdf, clean_text | |
def chunk_text(text, chunk_size=1000, chunk_overlap=150):
    """Split text into overlapping chunks.

    Args:
        text (str): Input text.
        chunk_size (int): Maximum size of each chunk, in characters.
        chunk_overlap (int): Number of characters shared between
            consecutive chunks.

    Returns:
        list: List of text chunks, each at most ``chunk_size`` characters.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # The empty-string fallback is required: without it, a span with no
        # newlines or spaces (e.g. a long token run) cannot be split further
        # and an oversized chunk leaks through. "" lets the splitter fall
        # back to character-level splitting, guaranteeing the size bound.
        separators=["\n\n", "\n", " ", ""],
    )
    return text_splitter.split_text(text)
def generate_embeddings(chunks, model_name='all-MiniLM-L6-v2'):
    """Embed each text chunk with a SentenceTransformer model.

    Args:
        chunks (list): Text chunks to embed.
        model_name (str): Name of the SentenceTransformer model to load.

    Returns:
        np.ndarray: One embedding vector per chunk.
    """
    # Load the model fresh for this call, then encode the whole batch at once.
    encoder = SentenceTransformer(model_name)
    embeddings = encoder.encode(chunks, convert_to_numpy=True)
    return embeddings
def process_pdf_for_rag(pdf_path, chunk_size=500, chunk_overlap=150):
    """Process a PDF for RAG: extract, clean, and chunk its text.

    Args:
        pdf_path (str): Path to the PDF file.
        chunk_size (int): Maximum size of each chunk, in characters.
        chunk_overlap (int): Number of characters shared between
            consecutive chunks. Defaults to 150, matching the previous
            implicit behavior, but is now caller-configurable.

    Returns:
        list: List of text chunks ready for embedding/indexing.
    """
    print("Extracting text from PDF...")
    raw_text = extract_text_from_pdf(pdf_path)

    print("Cleaning text...")
    clean_text_content = clean_text(raw_text)

    print("Chunking text...")
    # Forward both sizing knobs explicitly so this wrapper fully controls
    # chunking instead of silently relying on chunk_text's overlap default.
    chunks = chunk_text(clean_text_content, chunk_size, chunk_overlap)

    print("Processing complete!")
    return chunks