## 1. Set up SageMaker
*Explain more later...*

In [None]:
# Request a presigned URL for opening a SageMaker notebook instance.
# `client` was never defined in this notebook, so the SageMaker API client is
# taken from a SageMaker session here (local import keeps the cell runnable on
# a fresh kernel, before the main import cell has executed).
import sagemaker

client = sagemaker.Session().sagemaker_client

response = client.create_presigned_notebook_instance_url(
    # TODO: 'string' is a placeholder; replace with the real notebook instance name.
    NotebookInstanceName='string',
    # The documented valid range is 1800-43200 seconds; the previous value (123)
    # is below the minimum and fails parameter validation.
    SessionExpirationDurationInSeconds=1800
)

In [4]:
import json
import sagemaker
from sagemaker.huggingface import get_huggingface_llm_image_uri
from sagemaker.huggingface import HuggingFaceModel

# --- Deployment configuration -------------------------------------------------
# These names are read by the model-creation, deployment and LangChain cells.
hf_model_id = "tiiuae/falcon-40b-instruct"  # model id on huggingface.co/models
instance_type = "ml.g5.12xlarge"  # instance type hosting the endpoint
number_of_gpu = 4  # GPUs used for inference and tensor parallelism
# Container start-up health-check timeout in seconds (600 s = 10 minutes),
# raised to leave time for downloading the model.
health_check_timeout = 600
endpoint_name = "falcon-40b-instruct-demo"
aws_region = "us-east-1"

# Look up the Hugging Face LLM container image for the pinned version.
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.8.2")

# Execution role used for the model; printed so the ARN is visible in the output.
role = sagemaker.get_execution_role()
print(role)


arn:aws:iam::907929678403:role/service-role/AmazonSageMaker-ExecutionRole-20230621T132010


In [5]:
# Describe the model for SageMaker: the container image to run, the role to run
# it under, and the environment variables the container starts with.
llm_model = HuggingFaceModel(
    image_uri=llm_image,
    role=role,
    env={
        'HF_MODEL_ID': hf_model_id,
        'SM_NUM_GPUS': json.dumps(number_of_gpu),
        # Token budget: maximum prompt length, then prompt + generation combined.
        'MAX_INPUT_LENGTH': json.dumps(1024),
        'MAX_TOTAL_TOKENS': json.dumps(2048),
        # Optional: uncomment the next line to quantize the model.
        # 'HF_MODEL_QUANTIZE': "bitsandbytes",
    },
)

In [6]:
# Deploy the model to an endpoint using the values from the configuration cell.
# NOTE: the recorded run of this cell failed with ResourceLimitExceeded because
# the account quota for ml.g5.12xlarge endpoint instances was already in use.
llm = llm_model.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
    container_startup_health_check_timeout=health_check_timeout,
)

ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateEndpoint operation: The account-level service limit 'ml.g5.12xlarge for endpoint usage' is 2 Instances, with current utilization of 2 Instances and a request delta of 1 Instances. Please use AWS Service Quotas to request an increase for this quota. If AWS Service Quotas is not available, contact AWS support to request an increase for this quota.

In [None]:
# Tear down the endpoint named in the configuration cell.
# `llm2` is only assigned in a cell near the end of the notebook, so running
# top to bottom raised a NameError here; attach a Predictor to the endpoint in
# this cell instead.
# NOTE: the LangChain cells further down call this same endpoint, so it must
# exist (be deployed again) before they are run.
from sagemaker.predictor import Predictor

llm2 = Predictor(endpoint_name)
llm2.delete_endpoint()

In [None]:

import datetime
from typing import Dict

from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.llms.sagemaker_endpoint import LLMContentHandler, SagemakerEndpoint
from langchain.memory import ConversationBufferMemory

class ContentHandler(LLMContentHandler):
    """Convert between LangChain prompts and the endpoint's JSON payloads.

    ``transform_input`` builds the request body sent to the endpoint and
    ``transform_output`` turns the endpoint response back into answer text.
    The handler remembers the length of the most recent prompt so that many
    leading characters can be cut off the front of the generated text.
    """

    content_type = "application/json"
    accepts = "application/json"
    # Character length of the last prompt given to transform_input.
    len_prompt = 0

    def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
        """Serialize ``prompt`` with fixed generation parameters as UTF-8 JSON.

        Args:
            prompt: Full prompt text to send to the model.
            model_kwargs: Accepted for interface compatibility; unused here,
                because the generation parameters below are fixed.

        Returns:
            The JSON request body encoded as UTF-8 bytes.
        """
        self.len_prompt = len(prompt)
        payload = {
            "inputs": prompt,
            "parameters": {
                "do_sample": True,
                "top_p": 0.9,
                "temperature": 0.8,
                "max_new_tokens": 1024,
                "repetition_penalty": 1.03,
                # The former fourth entry was an empty string. An empty stop
                # sequence carries no information and, depending on how the
                # server matches stops, may end generation immediately, so it
                # was removed. NOTE(review): restore the intended token if the
                # empty string was a stripped placeholder.
                "stop": ["\n\n", "Human:", "<|endoftext|>"],
            },
        }
        return json.dumps(payload).encode('utf-8')

    def transform_output(self, output: bytes) -> str:
        """Extract the assistant's answer from the endpoint response.

        Args:
            output: Response body. Despite the ``bytes`` annotation it must
                expose ``.read()`` returning JSON whose first element has a
                ``generated_text`` field.

        Returns:
            The generated text with the first ``len_prompt`` characters removed
            (assumes the endpoint echoes the prompt first; TODO confirm), cut
            at the last "Human" marker if there is one, and stripped of
            surrounding whitespace.
        """
        res = json.loads(output.read())
        ans = res[0]['generated_text'][self.len_prompt:]
        # rfind returns -1 when "Human" is absent; slicing with -1 used to chop
        # the final character off the answer, so only truncate on a real match.
        cut = ans.rfind("Human")
        if cut != -1:
            ans = ans[:cut]
        return ans.strip()


# Handler instance passed to the SageMaker endpoint LLM wrapper.
content_handler = ContentHandler()

# Prompt for the QA chain: fixed instructions, the retrieved context, the
# conversation history and the newest human message, ending on the AI's turn.
prompt = PromptTemplate(
    template="""
The following is a given context and history of a conversation between a human and a helpful AI assistant. 
The AI assistant answers the human's messages with full, informational responses by inferring on the information given in the context, but does not extrapolate or hallucinate any facts that are not given.
When answering questions with unclear answers, the AI assistant qualifies its responses as such.
The AI assistant pretends it is also human and that the context is objectively true and well-known and therefore avoids mentioning the context as a source.

Context: {context}

{history}
Human: {new_message}
AI:""",
    input_variables=["history", "context", "new_message"],
)

def load_chain():
    """Build the question-answering chain backed by the SageMaker endpoint.

    Reads the notebook-level ``endpoint_name``, ``aws_region``,
    ``content_handler`` and ``prompt``.

    Returns:
        The chain created by ``load_qa_chain`` with chain type "stuff", a
        conversation buffer memory keyed on "history" and verbose output on.
    """
    endpoint_llm = SagemakerEndpoint(
        content_handler=content_handler,
        region_name=aws_region,
        endpoint_name=endpoint_name,
    )
    conversation_memory = ConversationBufferMemory(
        memory_key="history",
        input_key="new_message",
    )
    return load_qa_chain(
        llm=endpoint_llm,
        chain_type="stuff",
        prompt=prompt,
        memory=conversation_memory,
        verbose=True,
    )


# Chain instance used by the query cell.
dachain = load_chain()

In [None]:
# Ask one question: fetch the three most similar chunks, run the chain on them,
# and show the stripped answer text as the cell output.
# NOTE: `docsearch` is built in the "Load DSS Website Data" cell further down,
# so that cell has to be run before this one.
query = "What is Becton?"
retrieved_docs = docsearch.similarity_search(query, k=3)
qa_result = dachain(
    {"input_documents": retrieved_docs, "new_message": query},
    return_only_outputs=True,
)
qa_result['output_text'].strip()

## Load DSS Website Data into ChromaDB
The `urls` list defines which URLs are loaded into the context database.

In [None]:
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Pages that make up the context database.
urls = [
    'https://www.dssinc.com/blog/2022/6/21/suicide-prevention-manager-enabling-the-veterans-affairs-to-achieve-high-reliability-in-suicide-risk-identification',
    'https://www.dssinc.com/blog/2022/8/9/dss-inc-announces-appointment-of-brion-bailey-as-director-of-federal-business-development',
    'https://www.dssinc.com/blog/2022/3/21/march-22-is-diabetes-alertness-day-a-helpful-reminder-to-monitor-and-prevent-diabetes',
    'https://www.dssinc.com/blog/2023/5/24/supporting-the-vas-high-reliability-organization-journey-through-suicide-prevention',
    'https://www.dssinc.com/blog/2022/12/19/dss-theradoc-helps-battle-super-bugs-for-better-veteran-health',
    'https://www.dssinc.com/blog/2022/9/21/dss-inc-chosen-for-phase-two-of-mission-daybreak-vas-suicide-prevention-challenge',
    'https://www.dssinc.com/blog/2022/9/19/crescenz-va-medical-center-cmcvamc-deploys-the-dss-iconic-data-patient-case-manager-pcm-solution',
    'https://www.dssinc.com/blog/2022/5/9/federal-news-network-the-importance-of-va-supply-chain-modernization',
]

# Load every page, then split it into overlapping chunks
# (chunk size 1000, overlap 200) for retrieval.
loaders = UnstructuredURLLoader(urls=urls)
data = loaders.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(data)
print("Sources split into the following number of \"texts\":", len(texts))

# Load the embedding model used to vectorize the chunks.
print("Loading embedding model...")
embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")

# Index the chunks as Documents rather than bare strings, so each stored chunk
# keeps the metadata attached to it (presumably the source URL; TODO confirm).
# The previous from_texts call passed only page_content and dropped it.
docsearch = Chroma.from_documents(texts, embeddings)

In [None]:
# Print a start timestamp, then run the current `query` through the QA chain
# and print the stripped answer.
# Fix: the notebook-level chain object is `dachain`; `chain` exists only as a
# local inside load_chain(), so the old call raised a NameError.
print("Getting AI response... @ ", datetime.datetime.now().strftime("%H:%M:%S"))
print(
    dachain(
        {"input_documents": docsearch.similarity_search(query, k=3), "new_message": query},
        return_only_outputs=True,
    )['output_text'].strip()
)

In [None]:

# Clean up: delete the endpoint named in the configuration cell.
# `llm2` is only assigned in the cell that follows this one, so a top-to-bottom
# run raised a NameError here; attach a Predictor to the endpoint in this cell.
from sagemaker.predictor import Predictor

llm2 = Predictor(endpoint_name)
llm2.delete_endpoint()

In [None]:
from sagemaker.predictor import Predictor

# Wrap the endpoint, looked up by name, in a Predictor handle.
llm2 = Predictor(endpoint_name=endpoint_name)

In [None]:
# Print the Studio proxy URL for port 6006, built from the hard-coded Studio
# domain id below and the configured AWS region.
dom = "d-bipui5yzbvlc"
proxy_url = f'https://{dom}.studio.{aws_region}.sagemaker.aws/studiolab/default/jupyter/proxy/6006/'
print(proxy_url)