p-baleine committed on
Commit
972452d
1 Parent(s): eebb314

remove special tokens

Browse files
Files changed (1) hide show
  1. metaanalyser/paper/vectorstore.py +11 -1
metaanalyser/paper/vectorstore.py CHANGED
@@ -1,4 +1,6 @@
 
1
  import logging
 
2
  from langchain.embeddings import OpenAIEmbeddings
3
  from langchain.text_splitter import SpacyTextSplitter
4
  from langchain.vectorstores import FAISS
@@ -21,6 +23,14 @@ def create_papers_vectorstor(
21
  chunk_size=chunk_size,
22
  chunk_overlap=chunk_overlap,
23
  )
 
 
 
 
 
 
 
 
24
 
25
  logger.info(
26
  f"Creating vector store,"
@@ -29,7 +39,7 @@ def create_papers_vectorstor(
29
  )
30
 
31
  docs = splitter.create_documents(
32
- [p.text.replace("\n", " ") for p in tqdm(papers)],
33
  metadatas=[
34
  {
35
  'google_scholar_result_id': p.google_scholar_result_id,
 
1
+ import functools
2
  import logging
3
+ import tiktoken
4
  from langchain.embeddings import OpenAIEmbeddings
5
  from langchain.text_splitter import SpacyTextSplitter
6
  from langchain.vectorstores import FAISS
 
23
  chunk_size=chunk_size,
24
  chunk_overlap=chunk_overlap,
25
  )
26
+ enc = tiktoken.encoding_for_model(tiktoken_encoder_model_name)
27
+
28
+ def format_text(text):
29
+ return functools.reduce(
30
+ lambda text, special_token: text.replace(special_token, ""),
31
+ list(enc.special_tokens_set),
32
+ text
33
+ ).replace("\n", " ")
34
 
35
  logger.info(
36
  f"Creating vector store,"
 
39
  )
40
 
41
  docs = splitter.create_documents(
42
+ [format_text(p.text) for p in tqdm(papers)],
43
  metadatas=[
44
  {
45
  'google_scholar_result_id': p.google_scholar_result_id,