enoreyes committed on
Commit
fa8c8ef
1 Parent(s): 4a49e2a

Update code

Browse files
Files changed (2) hide show
  1. ingest.py +10 -6
  2. ingest_examples.py +7 -2
ingest.py CHANGED
@@ -6,7 +6,7 @@ from markdown import markdown
6
  import pickle
7
  from bs4 import BeautifulSoup
8
  from langchain.text_splitter import CharacterTextSplitter
9
- from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
10
  from langchain.vectorstores import FAISS
11
  from InstructorEmbedding import INSTRUCTOR
12
 
@@ -25,22 +25,26 @@ for p in Path("docs").rglob("*"):
25
  continue
26
  if str(p).lower().endswith(('.md', '.mdx')):
27
  with open(p) as f:
28
- print(p)
29
  filename = os.path.splitext(p)[0]
30
  docs.append(clean_data(f.read()))
31
- metadatas.append({"source": filename})
 
 
32
 
33
  text_splitter = CharacterTextSplitter(
34
  separator="\n",
35
- chunk_size=512,
36
- chunk_overlap=64,
37
  length_function=len,
38
  )
39
 
40
  documents = text_splitter.create_documents(docs, metadatas=metadatas)
41
 
42
  print("making embedding")
43
- embedding = HuggingFaceEmbeddings()
 
 
 
44
 
45
  print("beginning construction of faiss")
46
  search_index = FAISS.from_documents(documents, embedding)
 
6
  import pickle
7
  from bs4 import BeautifulSoup
8
  from langchain.text_splitter import CharacterTextSplitter
9
+ from langchain.embeddings import HuggingFaceInstructEmbeddings, OpenAIEmbeddings
10
  from langchain.vectorstores import FAISS
11
  from InstructorEmbedding import INSTRUCTOR
12
 
 
25
  continue
26
  if str(p).lower().endswith(('.md', '.mdx')):
27
  with open(p) as f:
 
28
  filename = os.path.splitext(p)[0]
29
  docs.append(clean_data(f.read()))
30
+ newfile_name = filename.replace("\\", "/")[5:]
31
+ print("file:" + newfile_name)
32
+ metadatas.append({"source": newfile_name})
33
 
34
  text_splitter = CharacterTextSplitter(
35
  separator="\n",
36
+ chunk_size=768,
37
+ chunk_overlap=128,
38
  length_function=len,
39
  )
40
 
41
  documents = text_splitter.create_documents(docs, metadatas=metadatas)
42
 
43
  print("making embedding")
44
+ model_name = "hkunlp/instructor-large"
45
+ embed_instruction = "Represent the text from the Hugging Face code documentation"
46
+ query_instruction = "Query the most relevant text from the Hugging Face code documentation"
47
+ embedding = HuggingFaceInstructEmbeddings(model_name=model_name, embed_instruction=embed_instruction, query_instruction=query_instruction)
48
 
49
  print("beginning construction of faiss")
50
  search_index = FAISS.from_documents(documents, embedding)
ingest_examples.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  from pathlib import Path
4
  import pickle
5
  from langchain.vectorstores import FAISS
6
- from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
7
  from langchain.text_splitter import CharacterTextSplitter
8
  from langchain.prompts.example_selector import \
9
  SemanticSimilarityExampleSelector
@@ -41,11 +41,16 @@ rephrase_documents = [
41
  }
42
  ]
43
 
 
 
 
 
 
44
  example_selector = SemanticSimilarityExampleSelector.from_examples(
45
  # This is the list of examples available to select from.
46
  rephrase_documents,
47
  # This is the embedding class used to produce embeddings which are used to measure semantic similarity.
48
- HuggingFaceEmbeddings(),
49
  # This is the VectorStore class that is used to store the embeddings and do a similarity search over.
50
  FAISS,
51
  # This is the number of examples to produce.
 
3
  from pathlib import Path
4
  import pickle
5
  from langchain.vectorstores import FAISS
6
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
7
  from langchain.text_splitter import CharacterTextSplitter
8
  from langchain.prompts.example_selector import \
9
  SemanticSimilarityExampleSelector
 
41
  }
42
  ]
43
 
44
+ model_name = "hkunlp/instructor-large"
45
+ embed_instruction = "Represent the text from the Hugging Face code documentation"
46
+ query_instruction = "Query the most relevant text from the Hugging Face code documentation"
47
+ embedding = HuggingFaceInstructEmbeddings(model_name=model_name, embed_instruction=embed_instruction, query_instruction=query_instruction)
48
+
49
  example_selector = SemanticSimilarityExampleSelector.from_examples(
50
  # This is the list of examples available to select from.
51
  rephrase_documents,
52
  # This is the embedding class used to produce embeddings which are used to measure semantic similarity.
53
+ embedding,
54
  # This is the VectorStore class that is used to store the embeddings and do a similarity search over.
55
  FAISS,
56
  # This is the number of examples to produce.