YchKhan committed on
Commit
74e0465
1 Parent(s): f817c38

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +14 -8
split_files_to_excel.py CHANGED
@@ -20,7 +20,9 @@ import unstructured
20
  from unstructured.partition.docx import partition_docx
21
  from unstructured.partition.auto import partition
22
 
23
- from transformers import AutoTokenizer
 
 
24
 
25
  from pypdf import PdfReader
26
 
@@ -40,14 +42,18 @@ embeddings = HuggingFaceEmbeddings(
40
 
41
 
42
 
43
- model_id = "mistralai/Mistral-7B-Instruct-v0.1"
44
- access_token = os.getenv("HUGGINGFACE_SPLITFILES_API_KEY")
 
 
 
 
 
 
 
 
 
45
 
46
- tokenizer = AutoTokenizer.from_pretrained(
47
- model_id,
48
- padding_side="left",
49
- token = access_token
50
- )
51
 
52
  text_splitter = CharacterTextSplitter(
53
  separator = "\n",
 
20
  from unstructured.partition.docx import partition_docx
21
  from unstructured.partition.auto import partition
22
 
23
+
24
+ import tiktoken
25
+ #from transformers import AutoTokenizer
26
 
27
  from pypdf import PdfReader
28
 
 
42
 
43
 
44
 
45
+ # model_id = "mistralai/Mistral-7B-Instruct-v0.1"
46
+ # access_token = os.getenv("HUGGINGFACE_SPLITFILES_API_KEY")
47
+
48
+ # tokenizer = AutoTokenizer.from_pretrained(
49
+ # model_id,
50
+ # padding_side="left",
51
+ # token = access_token
52
+ # )
53
+
54
+
55
+ tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
56
 
 
 
 
 
 
57
 
58
  text_splitter = CharacterTextSplitter(
59
  separator = "\n",