Spaces:
Runtime error
Runtime error
remove special tokens
Browse files
metaanalyser/paper/vectorstore.py
CHANGED
@@ -1,4 +1,6 @@
+import functools
 import logging
+import tiktoken
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.text_splitter import SpacyTextSplitter
 from langchain.vectorstores import FAISS
@@ -21,6 +23,14 @@ def create_papers_vectorstor(
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
     )
+    enc = tiktoken.encoding_for_model(tiktoken_encoder_model_name)
+
+    def format_text(text):
+        return functools.reduce(
+            lambda text, special_token: text.replace(special_token, ""),
+            list(enc.special_tokens_set),
+            text
+        ).replace("\n", " ")

     logger.info(
         f"Creating vector store,"
@@ -29,7 +39,7 @@ def create_papers_vectorstor(
     )

     docs = splitter.create_documents(
-        [p.text for p in tqdm(papers)],
+        [format_text(p.text) for p in tqdm(papers)],
         metadatas=[
             {
                 'google_scholar_result_id': p.google_scholar_result_id,