iohanngrig committed on
Commit f3175f2
1 Parent(s): 1a3e29b

Delete pages/utils

Files changed (1)
  1. pages/utils/process_data.py +0 -72
pages/utils/process_data.py DELETED
@@ -1,72 +0,0 @@
- from PyPDF2 import PdfReader
- from langchain.text_splitter import CharacterTextSplitter
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain_community.vectorstores import FAISS
-
-
- CHUNK_SIZE = 1024
- MAX_CHUNKS = 500
-
-
- def split_text_into_chunks(text, chunk_size=CHUNK_SIZE):
-     """
-     Splits text into smaller chunks.
-     Args:
-         text (str): Text to be split.
-         chunk_size (int, optional): Size of each chunk. Defaults to CHUNK_SIZE (1024).
-     Returns:
-         list[str]: List of text chunks.
-     """
-     chunks = []
-     for i in range(0, len(text), chunk_size):
-         chunks.append(text[i : i + chunk_size])
-     return chunks
-
-
- def generate_chunks(inp_str, max_chunks=MAX_CHUNKS):
-     """Chunk text into smaller pieces, splitting on sentence boundaries."""
-     inp_str = inp_str.replace('.', '.<eos>')
-     inp_str = inp_str.replace('?', '?<eos>')
-     inp_str = inp_str.replace('!', '!<eos>')
-
-     sentences = inp_str.split('<eos>')
-     current_chunk = 0
-     chunks = []
-     for sentence in sentences:
-         if len(chunks) == current_chunk + 1:
-             if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunks:
-                 chunks[current_chunk].extend(sentence.split(' '))
-             else:
-                 current_chunk += 1
-                 chunks.append(sentence.split(' '))
-         else:
-             chunks.append(sentence.split(' '))
-     return [' '.join(chunk) for chunk in chunks]
-
-
- def pdf_to_text(pdf_path):
-     """
-     Converts a PDF file to text.
-     Args:
-         pdf_path (str): Path to the PDF file.
-     Returns:
-         str: Extracted text from the PDF file.
-     """
-     reader = PdfReader(pdf_path)
-     extracted_texts = [page.extract_text() for page in reader.pages]
-     return " ".join(extracted_texts).replace("\n", " ")
-
-
- def process_text(text):
-     """Split the text into chunks with Langchain's CharacterTextSplitter and index them in a FAISS knowledge base."""
-     text_splitter = CharacterTextSplitter(
-         separator="\n",
-         chunk_size=CHUNK_SIZE,
-         chunk_overlap=200,
-         length_function=len
-     )
-     chunks = text_splitter.split_text(text)
-     # Convert the chunks of text into embeddings to form a knowledge base
-     embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
-     knowledgeBase = FAISS.from_texts(chunks, embeddings)
-     return knowledgeBase
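
For context only, a minimal sketch of how these (now deleted) helpers could have been exercised together before this commit; the file path "sample.pdf", the query string, and the k value are hypothetical placeholders, not taken from the repository:

# Hypothetical usage sketch (assumes the pre-deletion module path pages/utils/process_data.py):
# extract text from a PDF, build a FAISS knowledge base, and run a similarity search over it.
from pages.utils.process_data import pdf_to_text, process_text  # module removed by this commit

text = pdf_to_text("sample.pdf")        # "sample.pdf" is a placeholder path
knowledge_base = process_text(text)     # embeds the chunks and indexes them in FAISS
docs = knowledge_base.similarity_search("What is this document about?", k=3)
for doc in docs:
    print(doc.page_content[:200])       # preview the retrieved chunks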