iohanngrig committed on
Commit
d1ccbf9
1 Parent(s): f3175f2

Upload process_data.py

Browse files
Files changed (1) hide show
  1. utils/process_data.py +72 -0
utils/process_data.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PyPDF2 import PdfReader
2
+ from langchain.text_splitter import CharacterTextSplitter
3
+ from langchain_community.embeddings import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import FAISS
5
+
6
+
7
# Character budget per chunk; shared by split_text_into_chunks and by the
# CharacterTextSplitter configured in process_text.
CHUNK_SIZE = 1024
# NOTE(review): despite the name, this bounds the number of WORDS per chunk
# in generate_chunks — it is not a cap on how many chunks are produced.
MAX_CHUNKS = 500
9
+
10
+
11
def split_text_into_chunks(text, chunk_size=None):
    """
    Split text into fixed-size character chunks.

    Args:
        text (str): Text to be split.
        chunk_size (int, optional): Maximum characters per chunk.
            Defaults to CHUNK_SIZE (1024). The previous docstring claimed a
            default of 4,000, which did not match the constant.

    Returns:
        list[str]: Consecutive slices of ``text``; the final chunk may be
        shorter than ``chunk_size``. An empty string yields an empty list.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    # Comprehension over the same stride produces exactly the slices the
    # original append loop did.
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
24
+
25
+
26
def generate_chunks(inp_str, max_chunks=None):
    """
    Chunk text into pieces of at most ``max_chunks`` words, breaking only at
    sentence boundaries ('.', '?', '!').

    Args:
        inp_str (str): Text to chunk.
        max_chunks (int, optional): Maximum number of words per chunk
            (despite the name, this is a word budget, not a chunk count).
            Defaults to MAX_CHUNKS (500).

    Returns:
        list[str]: Sentence-aligned chunks. A single sentence longer than the
        budget still becomes its own (oversized) chunk; empty or
        whitespace-only input yields an empty list.
    """
    if max_chunks is None:
        max_chunks = MAX_CHUNKS

    # Mark sentence ends, then split on the marker. This is a heuristic: it
    # also fires on abbreviations and decimals ("e.g.", "3.14").
    for punct in ('.', '?', '!'):
        inp_str = inp_str.replace(punct, punct + '<eos>')
    sentences = inp_str.split('<eos>')

    chunks = []     # finished chunks, each a list of words
    current = []    # words accumulated for the chunk in progress
    for sentence in sentences:
        # split() (no argument) drops empty tokens. The original used
        # split(' '), so the empty fragment after the final punctuation — and
        # any leading space — became '' "words" that inflated the word count
        # and injected doubled/trailing spaces into the joined output.
        words = sentence.split()
        if not words:
            continue
        if current and len(current) + len(words) > max_chunks:
            chunks.append(current)
            current = []
        current.extend(words)
    if current:
        chunks.append(current)
    return [' '.join(chunk) for chunk in chunks]
45
+
46
+
47
def pdf_to_text(pdf_path):
    """
    Convert a PDF file to a single string of text.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Text extracted from every page, pages joined by a space, with
        newlines flattened to spaces.
    """
    document = PdfReader(pdf_path)
    combined = " ".join(page.extract_text() for page in document.pages)
    return combined.replace("\n", " ")
58
+
59
+
60
def process_text(text):
    """
    Build a FAISS knowledge base from raw text.

    The text is split into overlapping chunks with Langchain's
    CharacterTextSplitter, each chunk is embedded with a sentence-transformers
    model, and the resulting vectors are indexed in FAISS.

    Args:
        text (str): Raw text to index.

    Returns:
        FAISS: Vector store built from the embedded chunks.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=CHUNK_SIZE,
        chunk_overlap=200,
        length_function=len,
    )
    pieces = splitter.split_text(text)
    # Embed the chunks to form the knowledge base.
    embedder = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return FAISS.from_texts(pieces, embedder)