jonathanjordan21 commited on
Commit
b4d6f98
1 Parent(s): 82a6d63

Update custom_llm.py

Browse files
Files changed (1) hide show
  1. custom_llm.py +29 -2
custom_llm.py CHANGED
@@ -16,21 +16,48 @@ from langchain_core.messages import AIMessage, HumanMessage
16
  from langchain_community.document_loaders import DirectoryLoader
17
  from langchain.text_splitter import RecursiveCharacterTextSplitter
18
  from langchain_community.document_loaders import PyMuPDFLoader
19
- import os
20
  from langchain.embeddings import HuggingFaceEmbeddings
21
  from langchain.vectorstores import FAISS
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def create_vectorstore():
25
  loader = os.getenv('knowledge_base')
 
26
 
27
  splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=20)
28
 
29
- docs = splitter.create_documents([loader])
30
 
31
  emb_model = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-multilingual-mpnet-base-v2', encode_kwargs={'normalize_embeddings': True})
32
  db = FAISS.from_documents(docs, emb_model)
33
  return db
 
34
 
35
  def custom_chain_with_history(llm, memory):
36
 
 
16
  from langchain_community.document_loaders import DirectoryLoader
17
  from langchain.text_splitter import RecursiveCharacterTextSplitter
18
  from langchain_community.document_loaders import PyMuPDFLoader
19
+ import os, requests, bs4
20
  from langchain.embeddings import HuggingFaceEmbeddings
21
  from langchain.vectorstores import FAISS
22
 
23
 
24
def load_web(web_url):
    """Fetch *web_url* and return the de-duplicated texts of its ``<div>`` elements.

    Parameters
    ----------
    web_url : str
        URL of the page to scrape.

    Returns
    -------
    list[str]
        Non-empty, whitespace-stripped ``<div>`` texts with duplicates
        removed (insertion order preserved).

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # Fix: bound the request (a missing timeout can hang the process forever)
    # and fail fast on HTTP errors instead of parsing an error page.
    r = requests.get(web_url, timeout=30)
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.content, "html.parser")

    # Collect the stripped text of every non-empty <div>.
    input_list = [div.text.strip() for div in soup.find_all("div") if div.text.strip() != '']

    # NOTE: every item is already stripped above, so each dict key equals its
    # value and the length comparison never replaces an entry — the dict
    # simply de-duplicates while keeping the first occurrence.
    unique_strings = {}
    for item in input_list:
        cleaned_item = item.strip()
        if cleaned_item not in unique_strings or len(item) > len(unique_strings[cleaned_item]):
            unique_strings[cleaned_item] = item

    # Unique strings in first-seen order.
    return list(unique_strings.values())
49
def create_vectorstore(web_url="https://lintasmediadanawa.com"):
    """Build a FAISS vector store from the env knowledge base plus one web page.

    Parameters
    ----------
    web_url : str, optional
        Page whose ``<div>`` texts are scraped (via ``load_web``) and added
        to the store. Defaults to the previously hard-coded company site,
        so existing callers are unaffected.

    Returns
    -------
    FAISS
        Vector store built over the combined, chunked documents.
    """
    # Robustness fix: default to '' so a missing env var does not propagate
    # None into create_documents and crash.
    # NOTE(review): this assumes 'knowledge_base' holds the raw text itself
    # (it is passed straight to create_documents) — confirm with the caller.
    loader = os.getenv('knowledge_base', '')
    web_loader = load_web(web_url)

    # 512-char chunks with a small overlap to keep context across boundaries.
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=20)
    docs = splitter.create_documents([loader] + web_loader)

    # Multilingual sentence embeddings, L2-normalized for cosine similarity.
    emb_model = HuggingFaceEmbeddings(
        model_name='sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
        encode_kwargs={'normalize_embeddings': True},
    )
    db = FAISS.from_documents(docs, emb_model)
    return db
60
+
61
 
62
  def custom_chain_with_history(llm, memory):
63