import datetime
import uuid
import os
import re
import mimetypes
from collections import deque
from pathlib import Path
from urllib.parse import urlparse

import requests
import tiktoken
from bs4 import BeautifulSoup
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import WebBaseLoader, TextLoader, Docx2txtLoader, PyMuPDFLoader
from whatsapp_chat_custom import WhatsAppChatLoader  # use this instead of from langchain.document_loaders import WhatsAppChatLoader

# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^https?://.+'

mimetypes.init()
media_files = tuple(x for x in mimetypes.types_map if mimetypes.types_map[x].split('/')[0] in ['image', 'video', 'audio'])
filter_strings = ['/email-protection#']


def transformApi(api_key=''):
    if api_key == os.getenv("TEMP_PWD"):
        return os.getenv("OPENAI_API_KEY")
    elif api_key is None or api_key == '':
        return 'Null'
    else:
        return api_key


def get_hyperlinks(url):
    try:
        reqs = requests.get(url)
        # skip non-HTML responses and client/server errors
        if not reqs.headers.get('Content-Type', '').startswith("text/html") or 400 <= reqs.status_code < 600:
            return []
        soup = BeautifulSoup(reqs.text, 'html.parser')
    except Exception as e:
        print(e)
        return []

    hyperlinks = []
    for link in soup.find_all('a', href=True):
        hyperlinks.append(link.get('href'))

    return hyperlinks


# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    clean_links = []
    for link in set(get_hyperlinks(url)):
        clean_link = None

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            # Parse the URL and check if the domain is the same
            url_obj = urlparse(link)
            if url_obj.netloc.replace('www.', '') == local_domain.replace('www.', ''):
                clean_link = link

        # If the link is not a URL, check if it is a relative link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.startswith(("#", '?', 'mailto:')):
                continue

            if 'wp-content/uploads' in url:
                clean_link = url + "/" + link
            else:
                clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            clean_link = clean_link.strip().rstrip('/').replace('/../', '/')
            if not any(x in clean_link for x in filter_strings):
                clean_links.append(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(set(clean_links))


# this function will get you a list of all the URLs from the base URL
def crawl(url, local_domain, prog=None):
    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = set([url])

    # While the queue is not empty, continue crawling
    while queue:
        # Get the next URL from the queue
        url_pop = queue.pop()

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url_pop):
            if link not in seen:
                queue.append(link)
                seen.add(link)
                if len(seen) >= 100:  # cap the crawl at 100 unique URLs
                    return seen

        if prog is not None: prog(1, desc=f'Crawling: {url_pop}')

    return seen
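
# Illustrative only: a minimal sketch of how the crawl helpers above are
# typically driven. The URL below is a placeholder, not a value used anywhere
# in this project; this helper is never called by the module itself.
def _example_crawl(start_url='https://example.com'):
    domain = urlparse(start_url).netloc
    pages = crawl(start_url, domain)  # returns up to 100 unique same-domain URLs
    print(f'{len(pages)} same-domain pages found starting from {start_url}')
    return pages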
def ingestURL(documents, url, crawling=True, prog=None):
    url = url.rstrip('/')
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc
    if not (local_domain and url.startswith('http')):
        return documents
    print('Loading URL', url)

    if crawling:
        # crawl to get other webpages from this URL
        if prog is not None: prog(0, desc=f'Crawling: {url}')
        links = crawl(url, local_domain, prog)
        if prog is not None: prog(1, desc=f'Crawling: {url}')
    else:
        links = set([url])

    # separate pdf and other links
    c_links, pdf_links = [], []
    for x in links:
        if x.endswith('.pdf'):
            pdf_links.append(x)
        elif not x.endswith(media_files):
            c_links.append(x)

    # Clean links loader using WebBaseLoader
    if prog is not None: prog(0.5, desc=f'Ingesting: {url}')
    if c_links:
        loader = WebBaseLoader(list(c_links))
        documents.extend(loader.load())

    # remote PDFs loader
    for pdf_link in list(pdf_links):
        loader = PyMuPDFLoader(pdf_link)
        doc = loader.load()
        for x in doc:
            x.metadata['source'] = loader.source
        documents.extend(doc)

    return documents


def ingestFiles(documents, files_list, prog=None):
    for fPath in files_list:
        doc = None
        if fPath.endswith('.pdf'):
            doc = PyMuPDFLoader(fPath).load()
        elif fPath.endswith('.txt') and 'WhatsApp Chat with' not in fPath:
            doc = TextLoader(fPath).load()
        elif fPath.endswith(('.doc', '.docx')):
            doc = Docx2txtLoader(fPath).load()
        elif 'WhatsApp Chat with' in fPath and fPath.endswith('.csv'):
            # Convert WhatsApp TXT files to CSV using https://whatstk.streamlit.app/
            doc = WhatsAppChatLoader(fPath).load()
        else:
            pass

        if doc is not None and doc[0].page_content:
            if prog is not None: prog(1, desc='Loaded file: ' + fPath.rsplit('/', 1)[-1])
            print('Loaded file:', fPath)
            documents.extend(doc)

    return documents


def data_ingestion(inputDir=None, file_list=[], url_list=[], prog=None):
    documents = []

    # Ingestion from Input Directory
    if inputDir is not None:
        files = [str(x) for x in Path(inputDir).glob('**/*')]
        documents = ingestFiles(documents, files)
    if file_list:
        documents = ingestFiles(documents, file_list, prog)

    # Ingestion from URLs - also try https://python.langchain.com/docs/integrations/document_loaders/recursive_url_loader
    if url_list:
        for url in url_list:
            documents = ingestURL(documents, url, prog=prog)

    # Cleanup documents
    for x in documents:
        if 'WhatsApp Chat with' not in x.metadata['source']:
            x.page_content = x.page_content.strip().replace('\n', ' ').replace('\\n', ' ').replace('  ', ' ')

    # print(f"Total number of documents: {len(documents)}")
    return documents


def split_docs(documents):
    # Splitting and Chunks
    # default chunk size of 4000 makes around 1k tokens per doc. with k=4, this means 4k tokens input to LLM.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=250)
    docs = text_splitter.split_documents(documents)
    return docs
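
# Illustrative only: a sketch of the ingest-then-split flow defined above. The
# './docs' path is a placeholder assumption, and this helper is not called
# anywhere in the module; swap in a real folder before using it.
def _example_ingest_and_split(input_dir='./docs'):
    documents = data_ingestion(inputDir=input_dir)
    docs = split_docs(documents)
    print(f'{len(documents)} documents split into {len(docs)} chunks')
    return docs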
def getSourcesFromMetadata(metadata, sourceOnly=True, sepFileUrl=True):
    # metadata: list of metadata dicts from all documents
    setSrc = set()
    for x in metadata:
        metadataText = ''  # we need to convert each metadata dict into a string format. This string will be added to a set
        if x is not None:
            # extract source first, and then extract all other items
            source = x['source']
            source = source.rsplit('/', 1)[-1] if 'http' not in source else source

            notSource = []
            for k, v in x.items():
                if v is not None and k != 'source' and k in ['page', 'title']:
                    notSource.extend([f"{k}: {v}"])
            metadataText = ', '.join([f'source: {source}'] + notSource) if not sourceOnly else source
        setSrc.add(metadataText)

    if sepFileUrl:
        src_files = '\n'.join([f"{i+1}) {x}" for i, x in enumerate(sorted([x for x in setSrc if 'http' not in x], key=str.casefold))])
        src_urls = '\n'.join([f"{i+1}) {x}" for i, x in enumerate(sorted([x for x in setSrc if 'http' in x], key=str.casefold))])

        src_files = 'Files:\n' + src_files if src_files else ''
        src_urls = 'URLs:\n' + src_urls if src_urls else ''
        newLineSep = '\n\n' if src_files and src_urls else ''

        return src_files + newLineSep + src_urls, len(setSrc)
    else:
        src_docs = '\n'.join([f"{i+1}) {x}" for i, x in enumerate(sorted(list(setSrc), key=str.casefold))])
        return src_docs, len(setSrc)


def getVsDict(embeddingFunc, docs, vsDict={}):
    # create chroma client if it doesn't exist
    if vsDict.get('chromaClient') is None:
        vsDict['chromaDir'] = './vecstore/' + str(uuid.uuid1())
        vsDict['chromaClient'] = Chroma(embedding_function=embeddingFunc, persist_directory=vsDict['chromaDir'])
    # clear chroma client before adding new docs
    if vsDict['chromaClient']._collection.count() > 0:
        vsDict['chromaClient'].delete(vsDict['chromaClient'].get()['ids'])
    # add new docs to chroma client
    vsDict['chromaClient'].add_documents(docs)
    print('vectorstore count:', vsDict['chromaClient']._collection.count(), 'at', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    return vsDict


# used for hardcoded documents only - not uploaded by user (userData_vecStore is a separate function)
def localData_vecStore(openApiKey=None, inputDir=None, file_list=[], url_list=[], vsDict={}):
    documents = data_ingestion(inputDir, file_list, url_list)
    if not documents:
        return {}
    docs = split_docs(documents)
    # Embeddings
    embeddings = OpenAIEmbeddings(openai_api_key=openApiKey)
    # create chroma client if it doesn't exist
    vsDict_hd = getVsDict(embeddings, docs, vsDict)
    # get sources from metadata
    src_str = getSourcesFromMetadata(vsDict_hd['chromaClient'].get()['metadatas'])
    src_str = str(src_str[1]) + ' source document(s) successfully loaded in vector store.' + '\n\n' + src_str[0]
    print(src_str)
    return vsDict_hd


def num_tokens_from_string(string, encoding_name="cl100k_base"):
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens
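
# Illustrative only: a rough end-to-end sketch, assuming OPENAI_API_KEY is set
# in the environment and a './docs' folder exists. Neither assumption comes
# from this module; adjust both before running.
if __name__ == '__main__':
    api_key = os.getenv('OPENAI_API_KEY')
    if api_key and Path('./docs').exists():
        vs = localData_vecStore(openApiKey=api_key, inputDir='./docs')
        if vs:
            print('chroma store persisted at', vs['chromaDir'])
    else:
        print('Set OPENAI_API_KEY and create ./docs to run this demo.')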