import re
from pathlib import Path
from typing import Dict, List

from pypdf import PdfReader
|
| |
|
def load_documents(data_path: str) -> str:
    '''
    Read the LinkedIn PDF and the summary file from the data folder.

    The text of every page of ``linkedin.pdf`` is concatenated, then the
    contents of ``summary.txt`` are appended after a single newline.

    Parameters:
    - data_path (str): The path to the data folder

    Returns:
    - output (str): The extracted PDF text, a newline, then the summary text
    '''
    # Build the paths with pathlib so the separator is correct on every
    # platform; a hard-coded backslash only resolves on Windows.
    data_dir = Path(data_path)

    reader = PdfReader(data_dir / "linkedin.pdf")
    text_document = "".join(page.extract_text() for page in reader.pages)

    # NOTE(review): the file is read with the platform default encoding, as
    # before; confirm whether an explicit encoding should be enforced.
    with open(data_dir / "summary.txt", "r") as f:
        summary = f.read()

    return f"{text_document}\n{summary}"
|
| |
|
def sliding_window_chunk(text: str, overlap: int = 20, chunk_size: int = 200) -> List[str]:
    '''
    Split the text into overlapping chunks of whitespace-separated words.

    Each chunk holds at most ``chunk_size`` words, and consecutive chunks
    share ``overlap`` words. Words inside a chunk are joined with a single
    space. Empty or whitespace-only text yields an empty list.

    Parameters:
    - text (str): The text to split
    - overlap (int): Number of words shared by two consecutive chunks;
      must satisfy 0 <= overlap < chunk_size
    - chunk_size (int): Maximum number of words per chunk; must be positive

    Returns:
    - chunks (List[str]): A list of chunks of text

    Raises:
    - ValueError: If chunk_size is not positive, or overlap is negative or
      not smaller than chunk_size
    '''
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    # str.split() with no separator splits on any Unicode whitespace, which
    # already covers newlines and non-breaking spaces.
    words = text.split()

    # The window advances by the non-overlapping part of each chunk, so
    # neighbouring chunks share exactly `overlap` words.
    step = chunk_size - overlap

    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would only
        # repeat words already present in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | |