Chris4K committed
Commit ac861f4
1 Parent(s): 04011d9

Update app.py

Files changed (1): app.py +43 -4
app.py CHANGED
@@ -44,6 +44,45 @@ load_dotenv()
 from langchain_community.document_loaders import TextLoader
 from langchain_experimental.text_splitter import SemanticChunker
 
+#####################
+# Custom splitter: cuts documents apart wherever the literal marker "qqq" appears.
+from typing import Iterable, List
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_core.documents import Document
+
+
+class QQQSplitter(RecursiveCharacterTextSplitter):
+
+    def __init__(self, **kwargs):
+        # Pass chunk_size, chunk_overlap, etc. through to the parent splitter.
+        super().__init__(**kwargs)
+
+    def split(self, documents: Iterable[Document]) -> List[Document]:
+        """
+        Splits the given documents whenever there is a "qqq" sequence.
+        """
+        new_documents = []
+
+        for doc in documents:
+            # Cut the text at every literal "qqq" marker.
+            parts = doc.page_content.split("qqq")
+
+            # Wrap each part in its own Document, keeping the source metadata
+            # so downstream steps still know which file the chunk came from.
+            for current_part in parts:
+                print("cp " + current_part)
+                new_doc = Document(
+                    page_content=current_part,
+                    metadata=doc.metadata,
+                )
+                new_documents.append(new_doc)
+
+        return new_documents
+
+
+##############################
+
 def load_txt(path="./a.cv.ckaller.2024.txt"):
     loader = TextLoader(path)
     document = loader.load()
@@ -60,13 +99,13 @@ def load_txt(path="./a.cv.ckaller.2024.txt"):
 
 ######
 # split the document into chunks
-a_text_splitter = RecursiveCharacterTextSplitter(
+text_splitter = QQQSplitter(
     chunk_size=1500,
     chunk_overlap=250,
     length_function=len,
     is_separator_regex=False,
 )
-a_document_chunks = a_text_splitter.split_documents(document)
+document_chunks = text_splitter.split(document)
 
 
 #######
@@ -86,9 +125,9 @@ def load_txt(path="./a.cv.ckaller.2024.txt"):
 #####
 
 
-text_splitter = SemanticChunker(HuggingFaceBgeEmbeddings())
+#text_splitter = SemanticChunker(HuggingFaceBgeEmbeddings())
 
-document_chunks = text_splitter.create_documents([state_of_the_union])
+#document_chunks = text_splitter.create_documents([state_of_the_union])
 print(document_chunks[0].page_content)
 
 # load from disk
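
For reference, a minimal usage sketch of the new QQQSplitter. It assumes the QQQSplitter class added in this commit is in scope; the sample document and its "qqq" markers are made up for illustration and are not part of the repository.

from langchain_core.documents import Document

# Hypothetical sample document; the "qqq" markers are illustrative only.
sample = Document(
    page_content="Profile qqq Work experience qqq Education",
    metadata={"source": "example.txt"},
)

# Constructor arguments are forwarded to RecursiveCharacterTextSplitter.
splitter = QQQSplitter(chunk_size=1500, chunk_overlap=250)

chunks = splitter.split([sample])
for chunk in chunks:
    print(chunk.page_content)
# Expected pieces: "Profile ", " Work experience ", " Education"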