Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -44,6 +44,45 @@ load_dotenv()
|
|
44 |
from langchain_community.document_loaders import TextLoader
|
45 |
from langchain_experimental.text_splitter import SemanticChunker
|
46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
def load_txt(path="./a.cv.ckaller.2024.txt"):
|
48 |
loader = TextLoader(path)
|
49 |
document = loader.load()
|
@@ -60,13 +99,13 @@ def load_txt(path="./a.cv.ckaller.2024.txt"):
|
|
60 |
|
61 |
######
|
62 |
# split the document into chunks
|
63 |
-
|
64 |
chunk_size=1500,
|
65 |
chunk_overlap=250,
|
66 |
length_function=len,
|
67 |
is_separator_regex=False,
|
68 |
)
|
69 |
-
|
70 |
|
71 |
|
72 |
#######
|
@@ -86,9 +125,9 @@ def load_txt(path="./a.cv.ckaller.2024.txt"):
|
|
86 |
#####
|
87 |
|
88 |
|
89 |
-
text_splitter = SemanticChunker(HuggingFaceBgeEmbeddings())
|
90 |
|
91 |
-
document_chunks = text_splitter.create_documents([state_of_the_union])
|
92 |
print(document_chunks[0].page_content)
|
93 |
|
94 |
# load from disk
|
|
|
44 |
from langchain_community.document_loaders import TextLoader
|
45 |
from langchain_experimental.text_splitter import SemanticChunker
|
46 |
|
47 |
#####################
from typing import Iterable, List

from langchain import RecursiveCharacterTextSplitter
from langchain_core.documents import BaseDocumentTransformer, Document
|
50 |
+
|
51 |
+
class QQQSplitter(RecursiveCharacterTextSplitter):
    """Split documents wherever the literal marker ``"qqq"`` occurs.

    Each input Document is cut at every "qqq" occurrence; the marker itself
    is dropped. Every resulting piece becomes a new Document that keeps the
    metadata of the document it came from.
    """

    def __init__(self, **kwargs):
        """Create the splitter, forwarding base-splitter keyword arguments.

        The call site constructs ``QQQSplitter(chunk_size=1500,
        chunk_overlap=250, length_function=len, is_separator_regex=False)``;
        a zero-argument ``__init__`` would raise TypeError, so all keyword
        arguments are passed through to the base class.
        """
        super().__init__(**kwargs)

    def split(self, documents: Iterable[Document]) -> List[Document]:
        """Split each document at every "qqq" sequence.

        Args:
            documents: the documents to cut apart.

        Returns:
            A new list of Documents, one per "qqq"-delimited piece (empty
            pieces included, e.g. for leading/trailing or adjacent markers),
            each carrying the source document's metadata.
        """
        # Accumulate into a fresh list. The previous version re-bound the
        # `documents` parameter to [] and then both iterated and appended to
        # it, so the input was discarded and the result was always empty.
        chunks: List[Document] = []
        for doc in documents:
            # str.split on the literal marker replaces the hand-rolled
            # character scan, which never initialised current_part/parts,
            # actually triggered on "qq" rather than "qqq", and kept only
            # the final piece.
            for piece in doc.page_content.split("qqq"):
                chunks.append(Document(page_content=piece, metadata=doc.metadata))
        return chunks
|
84 |
+
##############################
|
85 |
+
|
86 |
def load_txt(path="./a.cv.ckaller.2024.txt"):
|
87 |
loader = TextLoader(path)
|
88 |
document = loader.load()
|
|
|
99 |
|
100 |
######
|
101 |
# split the document into chunks
|
102 |
+
text_splitter = QQQSplitter(
|
103 |
chunk_size=1500,
|
104 |
chunk_overlap=250,
|
105 |
length_function=len,
|
106 |
is_separator_regex=False,
|
107 |
)
|
108 |
+
document_chunks = text_splitter.split(document)
|
109 |
|
110 |
|
111 |
#######
|
|
|
125 |
#####
|
126 |
|
127 |
|
128 |
+
#text_splitter = SemanticChunker(HuggingFaceBgeEmbeddings())
|
129 |
|
130 |
+
#document_chunks = text_splitter.create_documents([state_of_the_union])
|
131 |
print(document_chunks[0].page_content)
|
132 |
|
133 |
# load from disk
|