Commit 19b9c91 (1 parent: d6b2cdd), committed by ian
README.md CHANGED
@@ -10,6 +10,8 @@ app_file: app.py
pinned: true
license: apache-2.0
fullWidth: true
+ preload_from_hub:
+ - BAAI/bge-base-en-v1.5
---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
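
Note: `preload_from_hub` asks the Space build step to download the listed Hub repositories into the image, so the first query does not stall on fetching the embedding model. A rough manual equivalent, shown only for illustration (the Space build performs this automatically):

```python
# Illustrative only: roughly what the new `preload_from_hub` entry achieves at build time.
from huggingface_hub import snapshot_download

# Fetch the embedding model declared in the README front matter into the local HF cache.
snapshot_download(repo_id="BAAI/bge-base-en-v1.5")
```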
flowsettings.py CHANGED
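
Note: the changes below switch the default local embedding to `BAAI/bge-base-en-v1.5` (declared through a `kotaemon.embeddings.FastEmbedEmbeddings` spec) and redirect the Hugging Face cache into the app data directory. A minimal standalone sketch of what that spec amounts to, assuming the `fastembed` package is installed; whether fastembed's own downloader honors `HF_HUB_CACHE` depends on its version, so read the cache lines as intent rather than a guarantee:

```python
# Standalone sketch (not part of this commit) of the new default embedding spec.
import os
from pathlib import Path

# Mirror flowsettings.py: point the Hugging Face cache at the app data directory
# before anything triggers a download (paths here are illustrative).
app_data = Path("./ktem_app_data").resolve()
os.environ["HF_HOME"] = str(app_data / "huggingface")
os.environ["HF_HUB_CACHE"] = str(app_data / "huggingface")

from fastembed import TextEmbedding  # imported after the env vars are set

model = TextEmbedding(model_name="BAAI/bge-base-en-v1.5")
vectors = list(model.embed(["What's different about Llama 2 from Llama 1?"]))
print(len(vectors), vectors[0].shape)  # 1 vector; bge-base produces 768-dim embeddings
```

The same change also moves `KH_REASONINGS` and `KH_VLM_ENDPOINT` above the commented-out local LLM examples; the VLM endpoint is simply the Azure OpenAI chat-completions URL assembled from `AZURE_OPENAI_ENDPOINT`, the vision deployment name, and the API version.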
@@ -1,11 +1,30 @@
+ import os
+ from inspect import currentframe, getframeinfo
from pathlib import Path

from decouple import config
from theflow.settings.default import *  # noqa

- user_cache_dir = Path("./cache")
- user_cache_dir.mkdir(parents=True, exist_ok=True)
+ cur_frame = currentframe()
+ if cur_frame is None:
+     raise ValueError("Cannot get the current frame.")
+ this_file = getframeinfo(cur_frame).filename
+ this_dir = Path(this_file).parent

+ # App can be ran from anywhere and it's not trivial to decide where to store app data.
+ # So let's use the same directory as the flowsetting.py file.
+ KH_APP_DATA_DIR = this_dir / "ktem_app_data"
+ KH_APP_DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+ # User data directory
+ KH_USER_DATA_DIR = KH_APP_DATA_DIR / "user_data"
+ KH_USER_DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+ # HF models can be big, let's store them in the app data directory so that it's easier
+ # for users to manage their storage.
+ # ref: https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache
+ os.environ["HF_HOME"] = str(KH_APP_DATA_DIR / "huggingface")
+ os.environ["HF_HUB_CACHE"] = str(KH_APP_DATA_DIR / "huggingface")

COHERE_API_KEY = config("COHERE_API_KEY", default="")
KH_MODE = "dev"
@@ -14,19 +33,19 @@ KH_FEATURE_USER_MANAGEMENT_ADMIN = str(
    config("KH_FEATURE_USER_MANAGEMENT_ADMIN", default="admin")
)
KH_FEATURE_USER_MANAGEMENT_PASSWORD = str(
-     config("KH_FEATURE_USER_MANAGEMENT_PASSWORD", default="Abc@123")
+     config("KH_FEATURE_USER_MANAGEMENT_PASSWORD", default="XsdMbe8zKP8KdeE@")
)
KH_ENABLE_ALEMBIC = False
- KH_DATABASE = f"sqlite:///{user_cache_dir / 'sql.db'}"
- KH_FILESTORAGE_PATH = str(user_cache_dir / "files")
+ KH_DATABASE = f"sqlite:///{KH_USER_DATA_DIR / 'sql.db'}"
+ KH_FILESTORAGE_PATH = str(KH_USER_DATA_DIR / "files")

KH_DOCSTORE = {
    "__type__": "kotaemon.storages.SimpleFileDocumentStore",
-     "path": str(user_cache_dir / "docstore"),
+     "path": str(KH_USER_DATA_DIR / "docstore"),
}
KH_VECTORSTORE = {
    "__type__": "kotaemon.storages.ChromaVectorStore",
-     "path": str(user_cache_dir / "vectorstore"),
+     "path": str(KH_USER_DATA_DIR / "vectorstore"),
}
KH_LLMS = {}
KH_EMBEDDINGS = {}
@@ -116,14 +135,30 @@ if config("LOCAL_MODEL", default=""):
    }

if len(KH_EMBEDDINGS) < 1:
-     KH_EMBEDDINGS["local-mxbai-large-v1"] = {
+     KH_EMBEDDINGS["local-bge-base-en-v1.5"] = {
        "spec": {
            "__type__": "kotaemon.embeddings.FastEmbedEmbeddings",
-             "model_name": "mixedbread-ai/mxbai-embed-large-v1",
+             "model_name": "BAAI/bge-base-en-v1.5",
        },
        "default": True,
    }

+ KH_REASONINGS = ["ktem.reasoning.simple.FullQAPipeline"]
+ KH_VLM_ENDPOINT = "{0}/openai/deployments/{1}/chat/completions?api-version={2}".format(
+     config("AZURE_OPENAI_ENDPOINT", default=""),
+     config("OPENAI_VISION_DEPLOYMENT_NAME", default="gpt-4-vision"),
+     config("OPENAI_API_VERSION", default=""),
+ )
+
+ # KH_LLMS["qwen_local"] = {
+ #     "spec": {
+ #         "__type__": "kotaemon.llms.LlamaCppChat",
+ #         "repo_id": "Qwen/Qwen1.5-0.5B-Chat-GGUF",
+ #         "filename": "qwen1_5-0_5b-chat-q5_k_m.gguf",
+ #     },
+ #     "default": False,
+ #     "cost": 0,
+ # }

# KH_LLMS["qwen1.5"] = {
#     "spec": {
@@ -137,14 +172,6 @@ if len(KH_EMBEDDINGS) < 1:
# }


- KH_REASONINGS = ["ktem.reasoning.simple.FullQAPipeline"]
- KH_VLM_ENDPOINT = "{0}/openai/deployments/{1}/chat/completions?api-version={2}".format(
-     config("AZURE_OPENAI_ENDPOINT", default=""),
-     config("OPENAI_VISION_DEPLOYMENT_NAME", default="gpt-4-vision"),
-     config("OPENAI_API_VERSION", default=""),
- )
-
-
SETTINGS_APP = {
    "lang": {
        "name": "Language",
ktem_app_data/user_data/docstore/index_1.json ADDED
@@ -0,0 +1 @@
+ {"6d7cf7ff-dca4-40e1-9186-dd520dc0ed81": {"id_": "6d7cf7ff-dca4-40e1-9186-dd520dc0ed81", "embedding": null, "metadata": {"file_path": "C:\\Users\\Ian\\AppData\\Local\\Temp\\gradio\\6fa0b452b0635453636593a1efb33b6773415a85\\Llama-2-FAQs.txt", "file_name": "Llama-2-FAQs.txt", "file_type": "text/plain", "file_size": 17059, "creation_date": "2024-04-13", "last_modified_date": "2024-04-14", "last_accessed_date": "2024-04-14"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "relationships": {"1": {"node_id": "54c4fd0d-39fa-44ef-ba71-7fdeb90ed566", "node_type": "4", "metadata": {"file_path": "C:\\Users\\Ian\\AppData\\Local\\Temp\\gradio\\6fa0b452b0635453636593a1efb33b6773415a85\\Llama-2-FAQs.txt", "file_name": "Llama-2-FAQs.txt", "file_type": "text/plain", "file_size": 17059, "creation_date": "2024-04-13", "last_modified_date": "2024-04-14", "last_accessed_date": "2024-04-14"}, "hash": "567a2dd31afd488e2ec63f5bf423d4f42c16e47b3c72a3e7c1883fb6e499c56d", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "10894a5f-d540-4f40-a8b7-d21ebb761553", "node_type": "1", "metadata": {}, "hash": "2b29101b70d7cf134093ee463d46986c583929eecdc4613a4e81c33267200555", "class_name": "RelatedNodeInfo"}}, "text": "General\n\n\n\nWhat's different about Llama 2 from Llama 1?\n\n\n 1. We received unprecedented interest in the Llama 1 model we released for the research community \u00e2\u20ac\u201c more than 100,000\n individuals and organizations have applied for access to Llama 1 and tens of thousands are now using it to innovate. After external\n feedback, fine-tuning, and extensive safety evaluations, we made the decision to release the next version of Llama more broadly.\n 2. Llama 2 is also available under a permissive commercial license, whereas Llama 1 was limited to non-commercial use.\n 3. Llama 2 is capable of processing longer prompts than Llama 1 and is also designed to work more efficiently.\n 4. For Llama 2 we\u00e2\u20ac\u2122re pairing our release of our pretrained models with versions fine-tuned for helpfulness and safety. Sharing fine-\n tuned versions makes it easier to use our models while also improving safety performance.\n\n\nWhat if I want to access Llama 2 models but I'm not sure if my use is permitted under the Llama 2\nCommunity License?\n\n\nOn a limited case by case basis, we will consider bespoke licensing requests from individual entities. Please contact llama2@meta.com to\nprovide more details about your request.\n\n\nWhere did the data come from to train the models? Was any Meta user data leveraged for training the\nmodels?\n\n A combination of sources are used for training. These sources include information that is publicly available online and annotated\n data to train our models.\n Llama 2 is not trained on Meta user data.\n\n\nWhy are you not sharing the training datasets for Llama 2?\n\n\nWe believe developers will have plenty to work with as we release our model weights and starting code for pre-trained and conversational\nfine-tuned versions as well as responsible use resources. While data mixes are intentionally withheld for competitive reasons, all models\nhave gone through Meta\u00e2\u20ac\u2122s internal Privacy Review process to ensure responsible data usage in building our products. 
We are\ndedicated to the responsible and ethical development of our genAI products, ensuring our policies reflect diverse contexts and meet\nevolving societal expectations.\n\n\nDid we use human annotators to develop the data for our models?\n\n\nYes. There are more details about our use of human annotators in the research paper.\n\n\nCan I use the output of the models to improve the Llama 2 family of models, even though I cannot use them\nfor other LLMs?\n\n\nIt's correct that the license restricts using any part of the Llama 2 models, including the response outputs to train another AI model (LLM\nor otherwise). However, one can use the outputs to further train the Llama 2 family of models. Techniques such as Quantized Aware\nTraining (QAT) utilize such a technique and hence this is allowed.\n\n\nWhat is Llama 2's max sequence length?\n\n4096. If you want to use more tokens, you will need to fine-tune the model so that it supports longer sequences. More information and\nexamples on fine-tuning can be found in the Llama Recipes repository.\n\n\nIs there a multilingual checkpoint for researchers to download?\n\fThe Llama models thus far have been mainly focused on the English language. We are looking at true multi-linguality for the future but for\nnow there are a lot of community projects that fine-tune Llama models to support languages.\n\n\nWhat operating systems (OS) are officially supported?\n\n\nLinux is the only OS currently supported by this repo.\n\n\nI am getting the following error with the download script. What should I do?\n\ndownload.sh: 14: [[: not found\n\n\n\nMake sure to run the command as follows\n\n\n./download.sh\n\n\nI am getting 'Issue with the URL' as an error message. What do I do?\n\nHTTP request sent, awaiting response... 400 Bad Request\n\n\n\nThe issue occurs because of not copying the URL correctly. If you right click on the link and copy the link, the link may be copied with url\ndefence wrapper. To avoid this problem, please select the url manually and copy it.\n\n\nDoes Llama 2 support other languages outside of English?\n\n\nThe model was primarily trained on English with a bit of additional data from 27 other languages (for more information, see Table 10 on\npage 20 of the Llama 2 paper). We do not expect the same level of performance in these languages as in English. You\u00e2\u20ac\u2122ll find the full\nlist of languages referenced in the research paper. You can look at some of the community lead projects to fine-tune Llama 2 models", "start_char_idx": 0, "end_char_idx": 4429, "text_template": "{metadata_str}\n\n{content}", "metadata_template": "{key}: {value}", "metadata_seperator": "\n", "content": "General\n\n\n\nWhat's different about Llama 2 from Llama 1?\n\n\n 1. We received unprecedented interest in the Llama 1 model we released for the research community \u00e2\u20ac\u201c more than 100,000\n individuals and organizations have applied for access to Llama 1 and tens of thousands are now using it to innovate. After external\n feedback, fine-tuning, and extensive safety evaluations, we made the decision to release the next version of Llama more broadly.\n 2. Llama 2 is also available under a permissive commercial license, whereas Llama 1 was limited to non-commercial use.\n 3. Llama 2 is capable of processing longer prompts than Llama 1 and is also designed to work more efficiently.\n 4. For Llama 2 we\u00e2\u20ac\u2122re pairing our release of our pretrained models with versions fine-tuned for helpfulness and safety. 
Sharing fine-\n tuned versions makes it easier to use our models while also improving safety performance.\n\n\nWhat if I want to access Llama 2 models but I'm not sure if my use is permitted under the Llama 2\nCommunity License?\n\n\nOn a limited case by case basis, we will consider bespoke licensing requests from individual entities. Please contact llama2@meta.com to\nprovide more details about your request.\n\n\nWhere did the data come from to train the models? Was any Meta user data leveraged for training the\nmodels?\n\n A combination of sources are used for training. These sources include information that is publicly available online and annotated\n data to train our models.\n Llama 2 is not trained on Meta user data.\n\n\nWhy are you not sharing the training datasets for Llama 2?\n\n\nWe believe developers will have plenty to work with as we release our model weights and starting code for pre-trained and conversational\nfine-tuned versions as well as responsible use resources. While data mixes are intentionally withheld for competitive reasons, all models\nhave gone through Meta\u00e2\u20ac\u2122s internal Privacy Review process to ensure responsible data usage in building our products. We are\ndedicated to the responsible and ethical development of our genAI products, ensuring our policies reflect diverse contexts and meet\nevolving societal expectations.\n\n\nDid we use human annotators to develop the data for our models?\n\n\nYes. There are more details about our use of human annotators in the research paper.\n\n\nCan I use the output of the models to improve the Llama 2 family of models, even though I cannot use them\nfor other LLMs?\n\n\nIt's correct that the license restricts using any part of the Llama 2 models, including the response outputs to train another AI model (LLM\nor otherwise). However, one can use the outputs to further train the Llama 2 family of models. Techniques such as Quantized Aware\nTraining (QAT) utilize such a technique and hence this is allowed.\n\n\nWhat is Llama 2's max sequence length?\n\n4096. If you want to use more tokens, you will need to fine-tune the model so that it supports longer sequences. More information and\nexamples on fine-tuning can be found in the Llama Recipes repository.\n\n\nIs there a multilingual checkpoint for researchers to download?\n\fThe Llama models thus far have been mainly focused on the English language. We are looking at true multi-linguality for the future but for\nnow there are a lot of community projects that fine-tune Llama models to support languages.\n\n\nWhat operating systems (OS) are officially supported?\n\n\nLinux is the only OS currently supported by this repo.\n\n\nI am getting the following error with the download script. What should I do?\n\ndownload.sh: 14: [[: not found\n\n\n\nMake sure to run the command as follows\n\n\n./download.sh\n\n\nI am getting 'Issue with the URL' as an error message. What do I do?\n\nHTTP request sent, awaiting response... 400 Bad Request\n\n\n\nThe issue occurs because of not copying the URL correctly. If you right click on the link and copy the link, the link may be copied with url\ndefence wrapper. To avoid this problem, please select the url manually and copy it.\n\n\nDoes Llama 2 support other languages outside of English?\n\n\nThe model was primarily trained on English with a bit of additional data from 27 other languages (for more information, see Table 10 on\npage 20 of the Llama 2 paper). We do not expect the same level of performance in these languages as in English. 
You\u00e2\u20ac\u2122ll find the full\nlist of languages referenced in the research paper. You can look at some of the community lead projects to fine-tune Llama 2 models", "source": null, "channel": null, "class_name": "Document"}, "10894a5f-d540-4f40-a8b7-d21ebb761553": {"id_": "10894a5f-d540-4f40-a8b7-d21ebb761553", "embedding": null, "metadata": {"file_path": "C:\\Users\\Ian\\AppData\\Local\\Temp\\gradio\\6fa0b452b0635453636593a1efb33b6773415a85\\Llama-2-FAQs.txt", "file_name": "Llama-2-FAQs.txt", "file_type": "text/plain", "file_size": 17059, "creation_date": "2024-04-13", "last_modified_date": "2024-04-14", "last_accessed_date": "2024-04-14"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "relationships": {"1": {"node_id": "54c4fd0d-39fa-44ef-ba71-7fdeb90ed566", "node_type": "4", "metadata": {"file_path": "C:\\Users\\Ian\\AppData\\Local\\Temp\\gradio\\6fa0b452b0635453636593a1efb33b6773415a85\\Llama-2-FAQs.txt", "file_name": "Llama-2-FAQs.txt", "file_type": "text/plain", "file_size": 17059, "creation_date": "2024-04-13", "last_modified_date": "2024-04-14", "last_accessed_date": "2024-04-14"}, "hash": "567a2dd31afd488e2ec63f5bf423d4f42c16e47b3c72a3e7c1883fb6e499c56d", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "6d7cf7ff-dca4-40e1-9186-dd520dc0ed81", "node_type": "1", "metadata": {"file_path": "C:\\Users\\Ian\\AppData\\Local\\Temp\\gradio\\6fa0b452b0635453636593a1efb33b6773415a85\\Llama-2-FAQs.txt", "file_name": "Llama-2-FAQs.txt", "file_type": "text/plain", "file_size": 17059, "creation_date": "2024-04-13", "last_modified_date": "2024-04-14", "last_accessed_date": "2024-04-14"}, "hash": "64c184429fc7578aa51210283f6784e92d4544b4c3746f641665620691f9c15b", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "e3b960e7-28ce-4cf2-adbe-80ee5bb97a5b", "node_type": "1", "metadata": {}, "hash": "1699c8274fce55d965619c02112169305b838d6fe0688828158edbe32fff61ce", "class_name": "RelatedNodeInfo"}}, "text": "that fine-tune Llama models to support languages.\n\n\nWhat operating systems (OS) are officially supported?\n\n\nLinux is the only OS currently supported by this repo.\n\n\nI am getting the following error with the download script. What should I do?\n\ndownload.sh: 14: [[: not found\n\n\n\nMake sure to run the command as follows\n\n\n./download.sh\n\n\nI am getting 'Issue with the URL' as an error message. What do I do?\n\nHTTP request sent, awaiting response... 400 Bad Request\n\n\n\nThe issue occurs because of not copying the URL correctly. If you right click on the link and copy the link, the link may be copied with url\ndefence wrapper. To avoid this problem, please select the url manually and copy it.\n\n\nDoes Llama 2 support other languages outside of English?\n\n\nThe model was primarily trained on English with a bit of additional data from 27 other languages (for more information, see Table 10 on\npage 20 of the Llama 2 paper). We do not expect the same level of performance in these languages as in English. You\u00e2\u20ac\u2122ll find the full\nlist of languages referenced in the research paper. You can look at some of the community lead projects to fine-tune Llama 2 models to\nsupport other languages. (eg. 
link)\n\n\nCan you run the Llama-7B model on Windows and/or macOS?\n\n\nThe vanilla model shipped in the repository does not run on Windows and/or macOS out of the box. There are some community led\nprojects that support running Llama on Mac, Windows, iOS, Android or anywhere (e.g llama cpp, MLC LLM, and Llama 2 Everywhere).\nYou can also find a work around at this issue based on Llama 2 fine-tuning.\n\n\nHow is the architecture of the v2 different from the one of the v1 model?\n\n\nSome differences between the two models include:\n\n 1. Llama 1 released 7, 13, 33 and 65 billion parameters while Llama 2 has7, 13 and 70 billion parameters\n 2. Llama 2 was trained on 40% more data\n 3. Llama2 has double the context length\n 4. Llama2 was fine-tuned for helpfulness and safety\n 5. Please review the research paper and model cards (llama 2 model card, llama 1 model card) for more differences.\n\n\nIf I'm a developer/business, how can I access it?\n\nDetails on how to access the models are available on our website link. Please note that the models are subject to the acceptable use\npolicy and the provided responsible use guide.ee the \u00e2\u20ac\u0153Accessing to Llama 2 Models\u00e2\u20ac\u200b section of this document for more information\non how to get access to the models.\n\n\nWhere can the models be found?\n\n\n 1. Models are available through multiple sources but the place to start is at https://llama.meta.com\n 2. Model code, quickstart guide and fine-tuning examples are available through our Github Llama repository. Model Weights are\n\f available through an email link after the user submits a sign-up form.\n 3. Models are also being hosted by Microsoft, Amazon Web Services, and Hugging Face, and may also be available through other\n hosting providers in the future.\n\n\nCan anyone access Llama 2? What are the terms?\n\n\n 1. Llama 2 is broadly available to developers and licensees through a variety of hosting providers and on the Meta website.\n 2. Llama 2 is licensed under the Llama 2 Community License Agreement, which provides a permissive license to the models along\n with certain restrictions to help ensure that the models are being used responsibly.\n\n\nWhat are the hardware SKU requirements for deploying these models?\n\n\nHardware requirements vary based on latency, throughput and cost constraints. For good latency, we split models across multiple GPUs\nwith tensor parallelism in a machine with NVIDIA A100s or H100s. But TPUs, other types of GPUs, or even commodity hardware can also\nbe used to deploy these models (e.g. llama cpp, MLC LLM).\n\n\nIs Llama trained with multi-query attention(MQA) and ALiBi?\n\n\nOnly the 70B model has MQA for more efficient inference.\n\n\nDoes the model provide traditional autoregressive text completion?\n\n\nLlama 2 is an auto-regressive language model, built on the transformer architecture. Llama 2 functions by taking a sequence of words as\ninput and predicting the next word, recursively generating", "start_char_idx": 3277, "end_char_idx": 7404, "text_template": "{metadata_str}\n\n{content}", "metadata_template": "{key}: {value}", "metadata_seperator": "\n", "content": "that fine-tune Llama models to support languages.\n\n\nWhat operating systems (OS) are officially supported?\n\n\nLinux is the only OS currently supported by this repo.\n\n\nI am getting the following error with the download script. 
What should I do?\n\ndownload.sh: 14: [[: not found\n\n\n\nMake sure to run the command as follows\n\n\n./download.sh\n\n\nI am getting 'Issue with the URL' as an error message. What do I do?\n\nHTTP request sent, awaiting response... 400 Bad Request\n\n\n\nThe issue occurs because of not copying the URL correctly. If you right click on the link and copy the link, the link may be copied with url\ndefence wrapper. To avoid this problem, please select the url manually and copy it.\n\n\nDoes Llama 2 support other languages outside of English?\n\n\nThe model was primarily trained on English with a bit of additional data from 27 other languages (for more information, see Table 10 on\npage 20 of the Llama 2 paper). We do not expect the same level of performance in these languages as in English. You\u00e2\u20ac\u2122ll find the full\nlist of languages referenced in the research paper. You can look at some of the community lead projects to fine-tune Llama 2 models to\nsupport other languages. (eg. link)\n\n\nCan you run the Llama-7B model on Windows and/or macOS?\n\n\nThe vanilla model shipped in the repository does not run on Windows and/or macOS out of the box. There are some community led\nprojects that support running Llama on Mac, Windows, iOS, Android or anywhere (e.g llama cpp, MLC LLM, and Llama 2 Everywhere).\nYou can also find a work around at this issue based on Llama 2 fine-tuning.\n\n\nHow is the architecture of the v2 different from the one of the v1 model?\n\n\nSome differences between the two models include:\n\n 1. Llama 1 released 7, 13, 33 and 65 billion parameters while Llama 2 has7, 13 and 70 billion parameters\n 2. Llama 2 was trained on 40% more data\n 3. Llama2 has double the context length\n 4. Llama2 was fine-tuned for helpfulness and safety\n 5. Please review the research paper and model cards (llama 2 model card, llama 1 model card) for more differences.\n\n\nIf I'm a developer/business, how can I access it?\n\nDetails on how to access the models are available on our website link. Please note that the models are subject to the acceptable use\npolicy and the provided responsible use guide.ee the \u00e2\u20ac\u0153Accessing to Llama 2 Models\u00e2\u20ac\u200b section of this document for more information\non how to get access to the models.\n\n\nWhere can the models be found?\n\n\n 1. Models are available through multiple sources but the place to start is at https://llama.meta.com\n 2. Model code, quickstart guide and fine-tuning examples are available through our Github Llama repository. Model Weights are\n\f available through an email link after the user submits a sign-up form.\n 3. Models are also being hosted by Microsoft, Amazon Web Services, and Hugging Face, and may also be available through other\n hosting providers in the future.\n\n\nCan anyone access Llama 2? What are the terms?\n\n\n 1. Llama 2 is broadly available to developers and licensees through a variety of hosting providers and on the Meta website.\n 2. Llama 2 is licensed under the Llama 2 Community License Agreement, which provides a permissive license to the models along\n with certain restrictions to help ensure that the models are being used responsibly.\n\n\nWhat are the hardware SKU requirements for deploying these models?\n\n\nHardware requirements vary based on latency, throughput and cost constraints. For good latency, we split models across multiple GPUs\nwith tensor parallelism in a machine with NVIDIA A100s or H100s. 
But TPUs, other types of GPUs, or even commodity hardware can also\nbe used to deploy these models (e.g. llama cpp, MLC LLM).\n\n\nIs Llama trained with multi-query attention(MQA) and ALiBi?\n\n\nOnly the 70B model has MQA for more efficient inference.\n\n\nDoes the model provide traditional autoregressive text completion?\n\n\nLlama 2 is an auto-regressive language model, built on the transformer architecture. Llama 2 functions by taking a sequence of words as\ninput and predicting the next word, recursively generating", "source": null, "channel": null, "class_name": "Document"}, "e3b960e7-28ce-4cf2-adbe-80ee5bb97a5b": {"id_": "e3b960e7-28ce-4cf2-adbe-80ee5bb97a5b", "embedding": null, "metadata": {"file_path": "C:\\Users\\Ian\\AppData\\Local\\Temp\\gradio\\6fa0b452b0635453636593a1efb33b6773415a85\\Llama-2-FAQs.txt", "file_name": "Llama-2-FAQs.txt", "file_type": "text/plain", "file_size": 17059, "creation_date": "2024-04-13", "last_modified_date": "2024-04-14", "last_accessed_date": "2024-04-14"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "relationships": {"1": {"node_id": "54c4fd0d-39fa-44ef-ba71-7fdeb90ed566", "node_type": "4", "metadata": {"file_path": "C:\\Users\\Ian\\AppData\\Local\\Temp\\gradio\\6fa0b452b0635453636593a1efb33b6773415a85\\Llama-2-FAQs.txt", "file_name": "Llama-2-FAQs.txt", "file_type": "text/plain", "file_size": 17059, "creation_date": "2024-04-13", "last_modified_date": "2024-04-14", "last_accessed_date": "2024-04-14"}, "hash": "567a2dd31afd488e2ec63f5bf423d4f42c16e47b3c72a3e7c1883fb6e499c56d", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "10894a5f-d540-4f40-a8b7-d21ebb761553", "node_type": "1", "metadata": {"file_path": "C:\\Users\\Ian\\AppData\\Local\\Temp\\gradio\\6fa0b452b0635453636593a1efb33b6773415a85\\Llama-2-FAQs.txt", "file_name": "Llama-2-FAQs.txt", "file_type": "text/plain", "file_size": 17059, "creation_date": "2024-04-13", "last_modified_date": "2024-04-14", "last_accessed_date": "2024-04-14"}, "hash": "dd3df5abf4539e419bfeb80cfbc4ff8efc8ea5a7bd0d645872e02e409fe43cc2", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "65d5beda-ff46-4976-9059-bf4155cbd683", "node_type": "1", "metadata": {}, "hash": "050e91f0ed647a672a3f156bb104893d623b26e4b07f5458befa37de68579faf", "class_name": "RelatedNodeInfo"}}, "text": "anyone access Llama 2? What are the terms?\n\n\n 1. Llama 2 is broadly available to developers and licensees through a variety of hosting providers and on the Meta website.\n 2. Llama 2 is licensed under the Llama 2 Community License Agreement, which provides a permissive license to the models along\n with certain restrictions to help ensure that the models are being used responsibly.\n\n\nWhat are the hardware SKU requirements for deploying these models?\n\n\nHardware requirements vary based on latency, throughput and cost constraints. For good latency, we split models across multiple GPUs\nwith tensor parallelism in a machine with NVIDIA A100s or H100s. But TPUs, other types of GPUs, or even commodity hardware can also\nbe used to deploy these models (e.g. 
llama cpp, MLC LLM).\n\n\nIs Llama trained with multi-query attention(MQA) and ALiBi?\n\n\nOnly the 70B model has MQA for more efficient inference.\n\n\nDoes the model provide traditional autoregressive text completion?\n\n\nLlama 2 is an auto-regressive language model, built on the transformer architecture. Llama 2 functions by taking a sequence of words as\ninput and predicting the next word, recursively generating text.\n\n\nDoes the model support fill-in-the-middle completion, e.g. allowing the user to specify a suffix string for the\nresponse?\n\n\nThe vanilla model of Llama does not, however, the Code Llama models have been trained with fill-in-the-middle completion to assist with\ntasks like code completion.\n\n\nDoes the model support logit biases as a request parameter to control token probabilities during sampling?\n\n\nThis is implementation dependent (i.e. the code used to run the model).\n\n\nDoes the model support adjusting sampling temperature or top-p threshold via request parameters?\n\n\nThe model itself supports these parameters, but whether they are exposed or not depends on implementation.\n\n\nWhat is the most effective RAG method paired with LIama 2?\n\n\nThere are many ways to use RAG with Llama. The most popular libraries are LangChain and LlamaIndex, and many of our developers\nhave used them successfully with Llama 2. See the LangChain, LlamaIndex\n\n\nHow to setup Llama 2 on an EC2 instance?\n\n\nYou can find steps on how to set up an EC2 instance in the AWS section here.\n\n\nWhat is the right size of EC2 instances needed for running each of the llama models?\n\n\nThe AWS section has some insights on instance size that you can start with.\n\n\nShould we start training with the base or chat model?\n\fThis depends on your application. The Llama 2 pre-trained models were trained for general large language applications, whereas the\nLlama 2 chat models were fine-tuned for dialogue specific uses like chat bots. You should review the model card and research paper for\nmore information on the models as this will help you decide which to use.\n\n\nI keep getting a \u00e2\u20ac\u0153CUDA out of memory\u00e2\u20ac\u200b error.\n\n\nThis error can be caused by a number of different factors including, model size being too large, in-efficient memory usage and so on.\nSome of the steps below have been known to help with this issue, but you might need to do some troubleshooting to figure out the exact\ncause of your issue.\n\n 1. Ensure your GPU has enough memory\n 2. Reduce the `batch_size`\n 3. Lower thePrecision\n 4. Clear cache\n 5. Modify the Model/Training\n\n\nRetrieval approach adds latency due to multiple calls at each turn. How to best leverage Llama+Retrieval?\n\n\nIf multiple calls are necessary then you could look into the following:\n\n 1. Optimize inference so each call has less latency.\n 2. Merge the calls into fewer calls. For example summarize the data and utilize the summary.\n 3. Possibly utilize Llama 2 function calling.\n 4. Consider fine-tuning the model with the updated data.\n\n\nHow good is the model (assuming the fine-tuned one) for handling direct customer input without additional\nRAI layers?\n\nSpecial attention was paid to safety while fine-tuning the Llama 2 chat models. The Llama 2 chat models scored better than the Falcon\nand MPT in the TruthfulQA and ToxiGen benchmarks. 
More information can be found in Section 4 of the Llama 2 paper.\n\n\n\n\nFine-tuning\n\n\n\nHow can I fine-tune the Llama 2 models?\n\n\nYou can find examples on how to fine-tune the Llama 2 models in the Llama Recipes repository.\n\n\nHow can I pretrain the Llama 2 models?\n\n\nYou", "start_char_idx": 6235, "end_char_idx": 10551, "text_template": "{metadata_str}\n\n{content}", "metadata_template": "{key}: {value}", "metadata_seperator": "\n", "content": "anyone access Llama 2? What are the terms?\n\n\n 1. Llama 2 is broadly available to developers and licensees through a variety of hosting providers and on the Meta website.\n 2. Llama 2 is licensed under the Llama 2 Community License Agreement, which provides a permissive license to the models along\n with certain restrictions to help ensure that the models are being used responsibly.\n\n\nWhat are the hardware SKU requirements for deploying these models?\n\n\nHardware requirements vary based on latency, throughput and cost constraints. For good latency, we split models across multiple GPUs\nwith tensor parallelism in a machine with NVIDIA A100s or H100s. But TPUs, other types of GPUs, or even commodity hardware can also\nbe used to deploy these models (e.g. llama cpp, MLC LLM).\n\n\nIs Llama trained with multi-query attention(MQA) and ALiBi?\n\n\nOnly the 70B model has MQA for more efficient inference.\n\n\nDoes the model provide traditional autoregressive text completion?\n\n\nLlama 2 is an auto-regressive language model, built on the transformer architecture. Llama 2 functions by taking a sequence of words as\ninput and predicting the next word, recursively generating text.\n\n\nDoes the model support fill-in-the-middle completion, e.g. allowing the user to specify a suffix string for the\nresponse?\n\n\nThe vanilla model of Llama does not, however, the Code Llama models have been trained with fill-in-the-middle completion to assist with\ntasks like code completion.\n\n\nDoes the model support logit biases as a request parameter to control token probabilities during sampling?\n\n\nThis is implementation dependent (i.e. the code used to run the model).\n\n\nDoes the model support adjusting sampling temperature or top-p threshold via request parameters?\n\n\nThe model itself supports these parameters, but whether they are exposed or not depends on implementation.\n\n\nWhat is the most effective RAG method paired with LIama 2?\n\n\nThere are many ways to use RAG with Llama. The most popular libraries are LangChain and LlamaIndex, and many of our developers\nhave used them successfully with Llama 2. See the LangChain, LlamaIndex\n\n\nHow to setup Llama 2 on an EC2 instance?\n\n\nYou can find steps on how to set up an EC2 instance in the AWS section here.\n\n\nWhat is the right size of EC2 instances needed for running each of the llama models?\n\n\nThe AWS section has some insights on instance size that you can start with.\n\n\nShould we start training with the base or chat model?\n\fThis depends on your application. The Llama 2 pre-trained models were trained for general large language applications, whereas the\nLlama 2 chat models were fine-tuned for dialogue specific uses like chat bots. 
You should review the model card and research paper for\nmore information on the models as this will help you decide which to use.\n\n\nI keep getting a \u00e2\u20ac\u0153CUDA out of memory\u00e2\u20ac\u200b error.\n\n\nThis error can be caused by a number of different factors including, model size being too large, in-efficient memory usage and so on.\nSome of the steps below have been known to help with this issue, but you might need to do some troubleshooting to figure out the exact\ncause of your issue.\n\n 1. Ensure your GPU has enough memory\n 2. Reduce the `batch_size`\n 3. Lower thePrecision\n 4. Clear cache\n 5. Modify the Model/Training\n\n\nRetrieval approach adds latency due to multiple calls at each turn. How to best leverage Llama+Retrieval?\n\n\nIf multiple calls are necessary then you could look into the following:\n\n 1. Optimize inference so each call has less latency.\n 2. Merge the calls into fewer calls. For example summarize the data and utilize the summary.\n 3. Possibly utilize Llama 2 function calling.\n 4. Consider fine-tuning the model with the updated data.\n\n\nHow good is the model (assuming the fine-tuned one) for handling direct customer input without additional\nRAI layers?\n\nSpecial attention was paid to safety while fine-tuning the Llama 2 chat models. The Llama 2 chat models scored better than the Falcon\nand MPT in the TruthfulQA and ToxiGen benchmarks. More information can be found in Section 4 of the Llama 2 paper.\n\n\n\n\nFine-tuning\n\n\n\nHow can I fine-tune the Llama 2 models?\n\n\nYou can find examples on how to fine-tune the Llama 2 models in the Llama Recipes repository.\n\n\nHow can I pretrain the Llama 2 models?\n\n\nYou", "source": null, "channel": null, "class_name": "Document"}, "65d5beda-ff46-4976-9059-bf4155cbd683": {"id_": "65d5beda-ff46-4976-9059-bf4155cbd683", "embedding": null, "metadata": {"file_path": "C:\\Users\\Ian\\AppData\\Local\\Temp\\gradio\\6fa0b452b0635453636593a1efb33b6773415a85\\Llama-2-FAQs.txt", "file_name": "Llama-2-FAQs.txt", "file_type": "text/plain", "file_size": 17059, "creation_date": "2024-04-13", "last_modified_date": "2024-04-14", "last_accessed_date": "2024-04-14"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "relationships": {"1": {"node_id": "54c4fd0d-39fa-44ef-ba71-7fdeb90ed566", "node_type": "4", "metadata": {"file_path": "C:\\Users\\Ian\\AppData\\Local\\Temp\\gradio\\6fa0b452b0635453636593a1efb33b6773415a85\\Llama-2-FAQs.txt", "file_name": "Llama-2-FAQs.txt", "file_type": "text/plain", "file_size": 17059, "creation_date": "2024-04-13", "last_modified_date": "2024-04-14", "last_accessed_date": "2024-04-14"}, "hash": "567a2dd31afd488e2ec63f5bf423d4f42c16e47b3c72a3e7c1883fb6e499c56d", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "e3b960e7-28ce-4cf2-adbe-80ee5bb97a5b", "node_type": "1", "metadata": {"file_path": "C:\\Users\\Ian\\AppData\\Local\\Temp\\gradio\\6fa0b452b0635453636593a1efb33b6773415a85\\Llama-2-FAQs.txt", "file_name": "Llama-2-FAQs.txt", "file_type": "text/plain", "file_size": 17059, "creation_date": "2024-04-13", "last_modified_date": "2024-04-14", "last_accessed_date": "2024-04-14"}, "hash": "ed54637f120c9f11b5e6914e0726e7e47e194d2308d8d84a9af330fa5b28f50f", "class_name": "RelatedNodeInfo"}, "3": {"node_id": "ddbbcbc7-3208-4abb-9ea0-f297c763caf7", "node_type": "1", 
"metadata": {}, "hash": "c5bc7441e7f2a59fed38bd8c646fe64bac421dca26dcde361148217502b854f5", "class_name": "RelatedNodeInfo"}}, "text": "Modify the Model/Training\n\n\nRetrieval approach adds latency due to multiple calls at each turn. How to best leverage Llama+Retrieval?\n\n\nIf multiple calls are necessary then you could look into the following:\n\n 1. Optimize inference so each call has less latency.\n 2. Merge the calls into fewer calls. For example summarize the data and utilize the summary.\n 3. Possibly utilize Llama 2 function calling.\n 4. Consider fine-tuning the model with the updated data.\n\n\nHow good is the model (assuming the fine-tuned one) for handling direct customer input without additional\nRAI layers?\n\nSpecial attention was paid to safety while fine-tuning the Llama 2 chat models. The Llama 2 chat models scored better than the Falcon\nand MPT in the TruthfulQA and ToxiGen benchmarks. More information can be found in Section 4 of the Llama 2 paper.\n\n\n\n\nFine-tuning\n\n\n\nHow can I fine-tune the Llama 2 models?\n\n\nYou can find examples on how to fine-tune the Llama 2 models in the Llama Recipes repository.\n\n\nHow can I pretrain the Llama 2 models?\n\n\nYou can adapt the finetuning script found here for pre-training. You can also find the hyperparams used for pretraining in Section 2 of the\nLlama 2 paper.\n\n\nAm I allowed to develop derivative models through fine-tuning based on Llama 2 for languages other than\nenglish? Is this a violation of the acceptable use policy?\n\n\nDevelopers may fine-tune Llama 2 models for languages beyond English provided they comply with the Llama 2 Community License and\nthe Acceptable Use Policy.\n\n\nHow can we reduce hallucination with fine-tuned LIama?\n\n\nAlthough prompts cannot eliminate hallucinations completely, they can reduce it significantly. Using techniques like Chain-of-thought,\nInstruction-Based, N-Shot, Few-Shot can help depending on your application. Additionally prompting the models to back up the responses\n\fby verifying with factual data sets or requesting the models to provide the source of information can help as well. Overall finetuning should\nalso be helpful for reducing hallucination.\n\n\nWill you release the tuning datasets?\n\n\nWe believe developers will have plenty to work with as we release our model weights and starting code for pre trained and conversational\nfine-tuned versions as well as responsible use of resources. While data mixes are intentionally withheld for competitive reasons, all\nmodels have gone through Meta\u00e2\u20ac\u2122s internal Privacy Review process to ensure responsible data usage in building our products. We are\ndedicated to the responsible and ethical development of our genAI products, ensuring our policies reflect diverse contexts and meet\nevolving societal expectations.\n\n\nWhat are the hardware SKU requirements for fine-tuning Llama pre-trained models?\n\nFine-tuning requirements also vary based on amount of data, time to complete fine-tuning and cost constraints. To fine-tune these models\nwe have generally used multiple NVIDIA A100 machines with data parallelism across nodes and a mix of data and tensor parallelism intra\nnode. But using a single machine, or other GPU types are definitely possible (e.g. 
alpaca models are trained on a single RTX4090:\nhttps://github.com/tloen/alpaca-lora)\n\n\nWhat fine-tuning tasks would these models support?\n\n\nThe Llama 2 fine-tuned models were fine-tuned for dialogue specific uses like chat bots.\n\n\nAre there examples on how one can fine-tune the models?\n\n\nYou can find example fine-tuning scripts in the Github recipes repository.\nYou can also review the fine-tuning section in our Getting started with Llama guide.\n\n\nWhat is the difference between a pre-trained and fine-tuned model?\n\n\nThe Llama 2 pre-trained models were trained for general large language applications, whereas the Llama 2 chat models were fine-tuned\nfor dialogue specific uses like chat bots.\n\n\nAre we going to publish model cards and info for each of the 4 models (7B/13B - PT and FT)?\n\n\nYou can find the Llama 2 model card here.\n\n\nWhat are some effective practices/ways that seem to be more effective at fine-tuning?\n\n\nYou can find some best practices in the fine-tuning section in our Getting started with Llama guide.\n\n\nHow effective is using LoRA?\n\n\nLoRA has made the fine-tuning of LLM like Llama 2 possible on consumer GPUs (like Tesla T4) by only retraining a very small set of\nmodel parameters, democratizing the fine-tuning of LLM, while still reaching comparable performance as fine-tuning the whole model on\nmuch expensive GPUs. So LoRA is essential and very effective in", "start_char_idx": 9506, "end_char_idx": 13976, "text_template": "{metadata_str}\n\n{content}", "metadata_template": "{key}: {value}", "metadata_seperator": "\n", "content": "Modify the Model/Training\n\n\nRetrieval approach adds latency due to multiple calls at each turn. How to best leverage Llama+Retrieval?\n\n\nIf multiple calls are necessary then you could look into the following:\n\n 1. Optimize inference so each call has less latency.\n 2. Merge the calls into fewer calls. For example summarize the data and utilize the summary.\n 3. Possibly utilize Llama 2 function calling.\n 4. Consider fine-tuning the model with the updated data.\n\n\nHow good is the model (assuming the fine-tuned one) for handling direct customer input without additional\nRAI layers?\n\nSpecial attention was paid to safety while fine-tuning the Llama 2 chat models. The Llama 2 chat models scored better than the Falcon\nand MPT in the TruthfulQA and ToxiGen benchmarks. More information can be found in Section 4 of the Llama 2 paper.\n\n\n\n\nFine-tuning\n\n\n\nHow can I fine-tune the Llama 2 models?\n\n\nYou can find examples on how to fine-tune the Llama 2 models in the Llama Recipes repository.\n\n\nHow can I pretrain the Llama 2 models?\n\n\nYou can adapt the finetuning script found here for pre-training. You can also find the hyperparams used for pretraining in Section 2 of the\nLlama 2 paper.\n\n\nAm I allowed to develop derivative models through fine-tuning based on Llama 2 for languages other than\nenglish? Is this a violation of the acceptable use policy?\n\n\nDevelopers may fine-tune Llama 2 models for languages beyond English provided they comply with the Llama 2 Community License and\nthe Acceptable Use Policy.\n\n\nHow can we reduce hallucination with fine-tuned LIama?\n\n\nAlthough prompts cannot eliminate hallucinations completely, they can reduce it significantly. Using techniques like Chain-of-thought,\nInstruction-Based, N-Shot, Few-Shot can help depending on your application. 
Additionally prompting the models to back up the responses\n\fby verifying with factual data sets or requesting the models to provide the source of information can help as well. Overall finetuning should\nalso be helpful for reducing hallucination.\n\n\nWill you release the tuning datasets?\n\n\nWe believe developers will have plenty to work with as we release our model weights and starting code for pre trained and conversational\nfine-tuned versions as well as responsible use of resources. While data mixes are intentionally withheld for competitive reasons, all\nmodels have gone through Meta\u00e2\u20ac\u2122s internal Privacy Review process to ensure responsible data usage in building our products. We are\ndedicated to the responsible and ethical development of our genAI products, ensuring our policies reflect diverse contexts and meet\nevolving societal expectations.\n\n\nWhat are the hardware SKU requirements for fine-tuning Llama pre-trained models?\n\nFine-tuning requirements also vary based on amount of data, time to complete fine-tuning and cost constraints. To fine-tune these models\nwe have generally used multiple NVIDIA A100 machines with data parallelism across nodes and a mix of data and tensor parallelism intra\nnode. But using a single machine, or other GPU types are definitely possible (e.g. alpaca models are trained on a single RTX4090:\nhttps://github.com/tloen/alpaca-lora)\n\n\nWhat fine-tuning tasks would these models support?\n\n\nThe Llama 2 fine-tuned models were fine-tuned for dialogue specific uses like chat bots.\n\n\nAre there examples on how one can fine-tune the models?\n\n\nYou can find example fine-tuning scripts in the Github recipes repository.\nYou can also review the fine-tuning section in our Getting started with Llama guide.\n\n\nWhat is the difference between a pre-trained and fine-tuned model?\n\n\nThe Llama 2 pre-trained models were trained for general large language applications, whereas the Llama 2 chat models were fine-tuned\nfor dialogue specific uses like chat bots.\n\n\nAre we going to publish model cards and info for each of the 4 models (7B/13B - PT and FT)?\n\n\nYou can find the Llama 2 model card here.\n\n\nWhat are some effective practices/ways that seem to be more effective at fine-tuning?\n\n\nYou can find some best practices in the fine-tuning section in our Getting started with Llama guide.\n\n\nHow effective is using LoRA?\n\n\nLoRA has made the fine-tuning of LLM like Llama 2 possible on consumer GPUs (like Tesla T4) by only retraining a very small set of\nmodel parameters, democratizing the fine-tuning of LLM, while still reaching comparable performance as fine-tuning the whole model on\nmuch expensive GPUs. 
So LoRA is essential and very effective in", "source": null, "channel": null, "class_name": "Document"}, "ddbbcbc7-3208-4abb-9ea0-f297c763caf7": {"id_": "ddbbcbc7-3208-4abb-9ea0-f297c763caf7", "embedding": null, "metadata": {"file_path": "C:\\Users\\Ian\\AppData\\Local\\Temp\\gradio\\6fa0b452b0635453636593a1efb33b6773415a85\\Llama-2-FAQs.txt", "file_name": "Llama-2-FAQs.txt", "file_type": "text/plain", "file_size": 17059, "creation_date": "2024-04-13", "last_modified_date": "2024-04-14", "last_accessed_date": "2024-04-14"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "relationships": {"1": {"node_id": "54c4fd0d-39fa-44ef-ba71-7fdeb90ed566", "node_type": "4", "metadata": {"file_path": "C:\\Users\\Ian\\AppData\\Local\\Temp\\gradio\\6fa0b452b0635453636593a1efb33b6773415a85\\Llama-2-FAQs.txt", "file_name": "Llama-2-FAQs.txt", "file_type": "text/plain", "file_size": 17059, "creation_date": "2024-04-13", "last_modified_date": "2024-04-14", "last_accessed_date": "2024-04-14"}, "hash": "567a2dd31afd488e2ec63f5bf423d4f42c16e47b3c72a3e7c1883fb6e499c56d", "class_name": "RelatedNodeInfo"}, "2": {"node_id": "65d5beda-ff46-4976-9059-bf4155cbd683", "node_type": "1", "metadata": {"file_path": "C:\\Users\\Ian\\AppData\\Local\\Temp\\gradio\\6fa0b452b0635453636593a1efb33b6773415a85\\Llama-2-FAQs.txt", "file_name": "Llama-2-FAQs.txt", "file_type": "text/plain", "file_size": 17059, "creation_date": "2024-04-13", "last_modified_date": "2024-04-14", "last_accessed_date": "2024-04-14"}, "hash": "abb1e69079b41e7c820be1851c875d8e54b4a1a91629f7e5a00ebace3ed3d638", "class_name": "RelatedNodeInfo"}}, "text": "examples on how one can fine-tune the models?\n\n\nYou can find example fine-tuning scripts in the Github recipes repository.\nYou can also review the fine-tuning section in our Getting started with Llama guide.\n\n\nWhat is the difference between a pre-trained and fine-tuned model?\n\n\nThe Llama 2 pre-trained models were trained for general large language applications, whereas the Llama 2 chat models were fine-tuned\nfor dialogue specific uses like chat bots.\n\n\nAre we going to publish model cards and info for each of the 4 models (7B/13B - PT and FT)?\n\n\nYou can find the Llama 2 model card here.\n\n\nWhat are some effective practices/ways that seem to be more effective at fine-tuning?\n\n\nYou can find some best practices in the fine-tuning section in our Getting started with Llama guide.\n\n\nHow effective is using LoRA?\n\n\nLoRA has made the fine-tuning of LLM like Llama 2 possible on consumer GPUs (like Tesla T4) by only retraining a very small set of\nmodel parameters, democratizing the fine-tuning of LLM, while still reaching comparable performance as fine-tuning the whole model on\nmuch expensive GPUs. So LoRA is essential and very effective in Llama 2 fine-tuning.\n\n\nHow should we think about post processing (validate generated data) as a way to fine-tune models?\n\n\na. It depends on the application what type of data we are fine-tuning on, but it needs to be beyond normal harness eval sets, something\nthat makes sense for the application, for example for something like sql data, maybe running generate code would be a better eval. So\nessentially having a truthful data on the specific application can be helpful to reduce the risk on a specific application\n\n\n\nb. 
Also setting some sort of threshold such as prob>90% might be helpful to get more confidence in the output\n\fWhat are the different libraries that we recommend for fine-tuning?\n\nYou can find some fine-tuning recommendations in the Llama 2 Github recipes repository as well as fine-tuning section in our Getting\nstarted with Llama guide.\n\n\nHow can we identify the right \u00e2\u20ac\u02dcr\u00e2\u20ac\u2122 value for LORA method for a certain use-case?\n\n\nThe best approach would be to review the LoRA research paper for more information on the rankings, then reviewing similar\nimplementations for other models and finally experimenting.\n\n\nWe hope to use prompt engineering as a lever to nudge behavior. Any pointers on enhancing instruction-\nfollowing by fine-tuning small llama models?\n\nTake a look at the fine-tuning section in our Getting started with Llama guide for some pointers towards fine-tuning.\n\n\n\n\nPrompting\n\n\n\nStrategies to help models handle longer conversations?\n\n\nYou can find some helpful information towards this in the Prompting and Llama demo app.\n\n\n\n\nLegal\n\n\n\nIs Llama 2 open source? What is the exact license these models are published under?\n\n\n 1. This is a bespoke commercial license that balances open access to the models with responsibility and protections in place to help\n address potential misuse.\n 2. Our license allows for broad commercial use, as well as for developers to create and redistribute additional work on top of Llama 2.\n 3. We want to enable more innovation in both research and commercial use cases, but believe in taking a responsible approach to\n releasing AI technologies.\n 4. For more details, our license can be found here.\n\n\nIs there any copyrighted material in any of the training datasets?\n\n\nThe model is trained on a subset of publicly available text-based datasets.\n\n\n\n\nBenchmarking\n\fDo we have any benchmarks for the v2 models? Are we planning to publish it? If so, do we have an ETA?\n\n\nYes we will publish benchmarks alongside the release. If there are particular benchmarks partners are interested in it may be possible to\nshare some under NDA earlier.", "start_char_idx": 12831, "end_char_idx": 16591, "text_template": "{metadata_str}\n\n{content}", "metadata_template": "{key}: {value}", "metadata_seperator": "\n", "content": "examples on how one can fine-tune the models?\n\n\nYou can find example fine-tuning scripts in the Github recipes repository.\nYou can also review the fine-tuning section in our Getting started with Llama guide.\n\n\nWhat is the difference between a pre-trained and fine-tuned model?\n\n\nThe Llama 2 pre-trained models were trained for general large language applications, whereas the Llama 2 chat models were fine-tuned\nfor dialogue specific uses like chat bots.\n\n\nAre we going to publish model cards and info for each of the 4 models (7B/13B - PT and FT)?\n\n\nYou can find the Llama 2 model card here.\n\n\nWhat are some effective practices/ways that seem to be more effective at fine-tuning?\n\n\nYou can find some best practices in the fine-tuning section in our Getting started with Llama guide.\n\n\nHow effective is using LoRA?\n\n\nLoRA has made the fine-tuning of LLM like Llama 2 possible on consumer GPUs (like Tesla T4) by only retraining a very small set of\nmodel parameters, democratizing the fine-tuning of LLM, while still reaching comparable performance as fine-tuning the whole model on\nmuch expensive GPUs. 
So LoRA is essential and very effective in Llama 2 fine-tuning.\n\n\nHow should we think about post processing (validate generated data) as a way to fine-tune models?\n\n\na. It depends on the application what type of data we are fine-tuning on, but it needs to be beyond normal harness eval sets, something\nthat makes sense for the application, for example for something like sql data, maybe running generate code would be a better eval. So\nessentially having a truthful data on the specific application can be helpful to reduce the risk on a specific application\n\n\n\nb. Also setting some sort of threshold such as prob>90% might be helpful to get more confidence in the output\n\fWhat are the different libraries that we recommend for fine-tuning?\n\nYou can find some fine-tuning recommendations in the Llama 2 Github recipes repository as well as fine-tuning section in our Getting\nstarted with Llama guide.\n\n\nHow can we identify the right \u00e2\u20ac\u02dcr\u00e2\u20ac\u2122 value for LORA method for a certain use-case?\n\n\nThe best approach would be to review the LoRA research paper for more information on the rankings, then reviewing similar\nimplementations for other models and finally experimenting.\n\n\nWe hope to use prompt engineering as a lever to nudge behavior. Any pointers on enhancing instruction-\nfollowing by fine-tuning small llama models?\n\nTake a look at the fine-tuning section in our Getting started with Llama guide for some pointers towards fine-tuning.\n\n\n\n\nPrompting\n\n\n\nStrategies to help models handle longer conversations?\n\n\nYou can find some helpful information towards this in the Prompting and Llama demo app.\n\n\n\n\nLegal\n\n\n\nIs Llama 2 open source? What is the exact license these models are published under?\n\n\n 1. This is a bespoke commercial license that balances open access to the models with responsibility and protections in place to help\n address potential misuse.\n 2. Our license allows for broad commercial use, as well as for developers to create and redistribute additional work on top of Llama 2.\n 3. We want to enable more innovation in both research and commercial use cases, but believe in taking a responsible approach to\n releasing AI technologies.\n 4. For more details, our license can be found here.\n\n\nIs there any copyrighted material in any of the training datasets?\n\n\nThe model is trained on a subset of publicly available text-based datasets.\n\n\n\n\nBenchmarking\n\fDo we have any benchmarks for the v2 models? Are we planning to publish it? If so, do we have an ETA?\n\n\nYes we will publish benchmarks alongside the release. If there are particular benchmarks partners are interested in it may be possible to\nshare some under NDA earlier.", "source": null, "channel": null, "class_name": "Document"}}
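
The added docstore index is a single JSON object mapping node IDs to serialized documents (text content plus file metadata and prev/next/source relationships), produced when the Llama-2-FAQs.txt test file below was indexed. A small inspection sketch, assuming it is read from the path added by this commit:

```python
# Inspect the committed SimpleFileDocumentStore index: node id -> serialized document.
import json
from pathlib import Path

index_path = Path("ktem_app_data/user_data/docstore/index_1.json")
nodes = json.loads(index_path.read_text(encoding="utf-8"))

for node_id, node in nodes.items():
    meta = node.get("metadata", {})
    print(node_id, meta.get("file_name"), f"{len(node.get('text', ''))} chars")
```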
ktem_app_data/user_data/files/6be00ed5b2e8573cd012b059834d24743ff9d0797cb72212cc8f6190e0af3a58 ADDED
@@ -0,0 +1,415 @@
1
+ General
2
+
3
+
4
+
5
+ What's different about Llama 2 from Llama 1?
6
+
7
+
8
+ 1. We received unprecedented interest in the Llama 1 model we released for the research community – more than 100,000
9
+ individuals and organizations have applied for access to Llama 1 and tens of thousands are now using it to innovate. After external
10
+ feedback, fine-tuning, and extensive safety evaluations, we made the decision to release the next version of Llama more broadly.
11
+ 2. Llama 2 is also available under a permissive commercial license, whereas Llama 1 was limited to non-commercial use.
12
+ 3. Llama 2 is capable of processing longer prompts than Llama 1 and is also designed to work more efficiently.
13
+ 4. For Llama 2 we’re pairing our release of our pretrained models with versions fine-tuned for helpfulness and safety. Sharing fine-
14
+ tuned versions makes it easier to use our models while also improving safety performance.
15
+
16
+
17
+ What if I want to access Llama 2 models but I'm not sure if my use is permitted under the Llama 2
18
+ Community License?
19
+
20
+
21
+ On a limited case by case basis, we will consider bespoke licensing requests from individual entities. Please contact llama2@meta.com to
22
+ provide more details about your request.
23
+
24
+
25
+ Where did the data come from to train the models? Was any Meta user data leveraged for training the
26
+ models?
27
+
28
+ A combination of sources are used for training. These sources include information that is publicly available online and annotated
29
+ data to train our models.
30
+ Llama 2 is not trained on Meta user data.
31
+
32
+
33
+ Why are you not sharing the training datasets for Llama 2?
34
+
35
+
36
+ We believe developers will have plenty to work with as we release our model weights and starting code for pre-trained and conversational
37
+ fine-tuned versions as well as responsible use resources. While data mixes are intentionally withheld for competitive reasons, all models
38
+ have gone through Meta’s internal Privacy Review process to ensure responsible data usage in building our products. We are
39
+ dedicated to the responsible and ethical development of our genAI products, ensuring our policies reflect diverse contexts and meet
40
+ evolving societal expectations.
41
+
42
+
43
+ Did we use human annotators to develop the data for our models?
44
+
45
+
46
+ Yes. There are more details about our use of human annotators in the research paper.
47
+
48
+
49
+ Can I use the output of the models to improve the Llama 2 family of models, even though I cannot use them
50
+ for other LLMs?
51
+
52
+
53
+ It's correct that the license restricts using any part of the Llama 2 models, including the response outputs to train another AI model (LLM
54
+ or otherwise). However, one can use the outputs to further train the Llama 2 family of models. Techniques such as Quantization-Aware
55
+ Training (QAT) rely on this and are hence allowed.
56
+
57
+
58
+ What is Llama 2's max sequence length?
59
+
60
+ 4096. If you want to use more tokens, you will need to fine-tune the model so that it supports longer sequences. More information and
61
+ examples on fine-tuning can be found in the Llama Recipes repository.
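A minimal sketch of keeping inputs within that 4096-token window at tokenization time, assuming the transformers package; the checkpoint id and prompt are illustrative assumptions.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")  # assumed example checkpoint

long_text = "word " * 10000  # placeholder for a prompt that may exceed the context window
encoded = tokenizer(long_text, truncation=True, max_length=4096, return_tensors="pt")
print(encoded["input_ids"].shape)  # at most (1, 4096)
```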
62
+
63
+
64
+ Is there a multilingual checkpoint for researchers to download?
65
+ The Llama models thus far have been mainly focused on the English language. We are looking at true multi-linguality for the future but for
66
+ now there are a lot of community projects that fine-tune Llama models to support other languages.
67
+
68
+
69
+ What operating systems (OS) are officially supported?
70
+
71
+
72
+ Linux is the only OS currently supported by this repo.
73
+
74
+
75
+ I am getting the following error with the download script. What should I do?
76
+
77
+ download.sh: 14: [[: not found
78
+
79
+
80
+
81
+ Make sure to run the command as follows
82
+
83
+
84
+ ./download.sh
85
+
86
+
87
+ I am getting 'Issue with the URL' as an error message. What do I do?
88
+
89
+ HTTP request sent, awaiting response... 400 Bad Request
90
+
91
+
92
+
93
+ The issue occurs when the URL is not copied correctly. If you right-click on the link and copy it, the link may be copied with a URL
94
+ defense wrapper. To avoid this problem, select the URL manually and copy it.
95
+
96
+
97
+ Does Llama 2 support other languages outside of English?
98
+
99
+
100
+ The model was primarily trained on English with a bit of additional data from 27 other languages (for more information, see Table 10 on
101
+ page 20 of the Llama 2 paper). We do not expect the same level of performance in these languages as in English. You’ll find the full
102
+ list of languages referenced in the research paper. You can look at some of the community-led projects to fine-tune Llama 2 models to
103
+ support other languages (e.g. link).
104
+
105
+
106
+ Can you run the Llama-7B model on Windows and/or macOS?
107
+
108
+
109
+ The vanilla model shipped in the repository does not run on Windows and/or macOS out of the box. There are some community-led
110
+ projects that support running Llama on Mac, Windows, iOS, Android or anywhere (e.g. llama cpp, MLC LLM, and Llama 2 Everywhere).
111
+ You can also find a work around at this issue based on Llama 2 fine-tuning.
112
+
113
+
114
+ How is the architecture of the v2 different from the one of the v1 model?
115
+
116
+
117
+ Some differences between the two models include:
118
+
119
+ 1. Llama 1 released 7, 13, 33 and 65 billion parameters while Llama 2 has 7, 13 and 70 billion parameters
120
+ 2. Llama 2 was trained on 40% more data
121
+ 3. Llama 2 has double the context length
122
+ 4. Llama 2 was fine-tuned for helpfulness and safety
123
+ 5. Please review the research paper and model cards (llama 2 model card, llama 1 model card) for more differences.
124
+
125
+
126
+ If I'm a developer/business, how can I access it?
127
+
128
+ Details on how to access the models are available on our website link. Please note that the models are subject to the acceptable use
129
+ policy and the provided responsible use guide. See the "Accessing to Llama 2 Models" section of this document for more information
130
+ on how to get access to the models.
131
+
132
+
133
+ Where can the models be found?
134
+
135
+
136
+ 1. Models are available through multiple sources but the place to start is at https://llama.meta.com
137
+ 2. Model code, quickstart guide and fine-tuning examples are available through our Github Llama repository. Model Weights are
138
+ available through an email link after the user submits a sign-up form.
139
+ 3. Models are also being hosted by Microsoft, Amazon Web Services, and Hugging Face, and may also be available through other
140
+ hosting providers in the future.
141
+
142
+
143
+ Can anyone access Llama 2? What are the terms?
144
+
145
+
146
+ 1. Llama 2 is broadly available to developers and licensees through a variety of hosting providers and on the Meta website.
147
+ 2. Llama 2 is licensed under the Llama 2 Community License Agreement, which provides a permissive license to the models along
148
+ with certain restrictions to help ensure that the models are being used responsibly.
149
+
150
+
151
+ What are the hardware SKU requirements for deploying these models?
152
+
153
+
154
+ Hardware requirements vary based on latency, throughput and cost constraints. For good latency, we split models across multiple GPUs
155
+ with tensor parallelism in a machine with NVIDIA A100s or H100s. But TPUs, other types of GPUs, or even commodity hardware can also
156
+ be used to deploy these models (e.g. llama cpp, MLC LLM).
157
+
158
+
159
+ Is Llama trained with multi-query attention(MQA) and ALiBi?
160
+
161
+
162
+ Only the 70B model has MQA for more efficient inference.
163
+
164
+
165
+ Does the model provide traditional autoregressive text completion?
166
+
167
+
168
+ Llama 2 is an auto-regressive language model, built on the transformer architecture. Llama 2 functions by taking a sequence of words as
169
+ input and predicting the next word, recursively generating text.
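A minimal sketch of that auto-regressive loop with greedy decoding, assuming the transformers and torch packages; the checkpoint id and prompt are illustrative assumptions.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"            # assumed example checkpoint
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

ids = tok("The capital of France is", return_tensors="pt").input_ids
for _ in range(10):                              # generate 10 tokens, one at a time
    with torch.no_grad():
        logits = model(ids).logits               # (batch, seq_len, vocab)
    next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)
    ids = torch.cat([ids, next_id], dim=-1)      # feed the prediction back in

print(tok.decode(ids[0], skip_special_tokens=True))
```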
170
+
171
+
172
+ Does the model support fill-in-the-middle completion, e.g. allowing the user to specify a suffix string for the
173
+ response?
174
+
175
+
176
+ The vanilla model of Llama does not, however, the Code Llama models have been trained with fill-in-the-middle completion to assist with
177
+ tasks like code completion.
178
+
179
+
180
+ Does the model support logit biases as a request parameter to control token probabilities during sampling?
181
+
182
+
183
+ This is implementation dependent (i.e. the code used to run the model).
184
+
185
+
186
+ Does the model support adjusting sampling temperature or top-p threshold via request parameters?
187
+
188
+
189
+ The model itself supports these parameters, but whether they are exposed or not depends on implementation.
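As a sketch of one common implementation, these knobs appear as arguments to the transformers generate() API; the values below are illustrative, not recommendations.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-chat-hf"       # assumed example checkpoint
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tok("Write a haiku about the sea.", return_tensors="pt")
with torch.no_grad():
    out = model.generate(
        **inputs,
        do_sample=True,
        temperature=0.7,   # values below 1.0 sharpen the distribution, above 1.0 flatten it
        top_p=0.9,         # nucleus sampling: keep the smallest token set covering 90% of the mass
        max_new_tokens=64,
    )
print(tok.decode(out[0], skip_special_tokens=True))
```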
190
+
191
+
192
+ What is the most effective RAG method paired with Llama 2?
193
+
194
+
195
+ There are many ways to use RAG with Llama. The most popular libraries are LangChain and LlamaIndex, and many of our developers
196
+ have used them successfully with Llama 2. See the LangChain and LlamaIndex documentation for examples.
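A minimal, library-agnostic sketch of the retrieve-then-read pattern those libraries implement, assuming only the sentence-transformers package; the embedding model, documents, and question are illustrative, and the final Llama call is left as a stub.

```python
import numpy as np
from sentence_transformers import SentenceTransformer

docs = [
    "Llama 2 supports a 4096-token context window.",
    "LoRA fine-tunes a small set of adapter weights.",
    "The download script requires bash, not sh.",
]
embedder = SentenceTransformer("all-MiniLM-L6-v2")     # assumed embedding model
doc_vecs = embedder.encode(docs, normalize_embeddings=True)

question = "How long can Llama 2 prompts be?"
q_vec = embedder.encode([question], normalize_embeddings=True)[0]

top_k = np.argsort(doc_vecs @ q_vec)[::-1][:2]          # cosine similarity via dot product
context = "\n".join(docs[i] for i in top_k)

prompt = f"Answer using only this context:\n{context}\n\nQuestion: {question}\nAnswer:"
# `prompt` would then be sent to a Llama 2 completion endpoint of your choice.
print(prompt)
```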
197
+
198
+
199
+ How to setup Llama 2 on an EC2 instance?
200
+
201
+
202
+ You can find steps on how to set up an EC2 instance in the AWS section here.
203
+
204
+
205
+ What is the right size of EC2 instances needed for running each of the llama models?
206
+
207
+
208
+ The AWS section has some insights on instance size that you can start with.
209
+
210
+
211
+ Should we start training with the base or chat model?
212
+ This depends on your application. The Llama 2 pre-trained models were trained for general large language applications, whereas the
213
+ Llama 2 chat models were fine-tuned for dialogue specific uses like chat bots. You should review the model card and research paper for
214
+ more information on the models as this will help you decide which to use.
215
+
216
+
217
+ I keep getting a "CUDA out of memory" error.
218
+
219
+
220
+ This error can be caused by a number of different factors, including the model size being too large, inefficient memory usage, and so on.
221
+ Some of the steps below have been known to help with this issue, but you might need to do some troubleshooting to figure out the exact
222
+ cause of your issue.
223
+
224
+ 1. Ensure your GPU has enough memory
225
+ 2. Reduce the `batch_size`
226
+ 3. Lower the precision
227
+ 4. Clear cache
228
+ 5. Modify the Model/Training
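A minimal sketch of points 2 through 4 with transformers and PyTorch; the checkpoint id, prompts, and the use of device_map (which requires the accelerate package) are assumptions for illustration.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"            # assumed example checkpoint
tok = AutoTokenizer.from_pretrained(model_id)

# Lower the precision: fp16 roughly halves weight memory versus fp32.
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"  # device_map="auto" needs accelerate
)

prompts = ["First prompt", "Second prompt", "Third prompt"]
# Reduce the batch size: process one prompt at a time instead of one large batch.
for p in prompts:
    inputs = tok(p, return_tensors="pt").to(model.device)
    with torch.no_grad():
        _ = model.generate(**inputs, max_new_tokens=32)

# Clear cache: release unused cached blocks between large runs.
torch.cuda.empty_cache()
```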
229
+
230
+
231
+ Retrieval approach adds latency due to multiple calls at each turn. How to best leverage Llama+Retrieval?
232
+
233
+
234
+ If multiple calls are necessary then you could look into the following:
235
+
236
+ 1. Optimize inference so each call has less latency.
237
+ 2. Merge the calls into fewer calls. For example summarize the data and utilize the summary.
238
+ 3. Possibly utilize Llama 2 function calling.
239
+ 4. Consider fine-tuning the model with the updated data.
240
+
241
+
242
+ How good is the model (assuming the fine-tuned one) for handling direct customer input without additional
243
+ RAI layers?
244
+
245
+ Special attention was paid to safety while fine-tuning the Llama 2 chat models. The Llama 2 chat models scored better than the Falcon
246
+ and MPT in the TruthfulQA and ToxiGen benchmarks. More information can be found in Section 4 of the Llama 2 paper.
247
+
248
+
249
+
250
+
251
+ Fine-tuning
252
+
253
+
254
+
255
+ How can I fine-tune the Llama 2 models?
256
+
257
+
258
+ You can find examples on how to fine-tune the Llama 2 models in the Llama Recipes repository.
259
+
260
+
261
+ How can I pretrain the Llama 2 models?
262
+
263
+
264
+ You can adapt the finetuning script found here for pre-training. You can also find the hyperparams used for pretraining in Section 2 of the
265
+ Llama 2 paper.
266
+
267
+
268
+ Am I allowed to develop derivative models through fine-tuning based on Llama 2 for languages other than
269
+ english? Is this a violation of the acceptable use policy?
270
+
271
+
272
+ Developers may fine-tune Llama 2 models for languages beyond English provided they comply with the Llama 2 Community License and
273
+ the Acceptable Use Policy.
274
+
275
+
276
+ How can we reduce hallucination with fine-tuned Llama?
277
+
278
+
279
+ Although prompts cannot eliminate hallucinations completely, they can reduce them significantly. Using techniques like Chain-of-Thought,
280
+ Instruction-Based, N-Shot, Few-Shot can help depending on your application. Additionally prompting the models to back up the responses
281
+ by verifying with factual data sets or requesting the models to provide the source of information can help as well. Overall finetuning should
282
+ also be helpful for reducing hallucination.
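A minimal sketch combining two of the ideas above, few-shot grounding plus an explicit "cite the source or abstain" instruction; the wording and context string are illustrative, not an official template.

```python
# A prompt template that grounds answers in provided context and asks the model to abstain otherwise.
context = "Llama 2 was released with 7B, 13B and 70B parameter variants."

few_shot = (
    "Q: How many parameter sizes does Llama 2 come in?\n"
    "A: Three (7B, 13B, 70B). Source: provided context.\n"
    "Q: What is Llama 2's training cluster size?\n"
    "A: The provided context does not say, so I cannot answer.\n"
)

question = "Which Llama 2 variant is the largest?"
prompt = (
    "Use only the context below. Cite the source for every claim, "
    "and say 'I don't know' if the context is insufficient.\n\n"
    f"Context: {context}\n\n{few_shot}Q: {question}\nA:"
)
print(prompt)
```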
283
+
284
+
285
+ Will you release the tuning datasets?
286
+
287
+
288
+ We believe developers will have plenty to work with as we release our model weights and starting code for pre trained and conversational
289
+ fine-tuned versions as well as responsible use of resources. While data mixes are intentionally withheld for competitive reasons, all
290
+ models have gone through Meta’s internal Privacy Review process to ensure responsible data usage in building our products. We are
291
+ dedicated to the responsible and ethical development of our genAI products, ensuring our policies reflect diverse contexts and meet
292
+ evolving societal expectations.
293
+
294
+
295
+ What are the hardware SKU requirements for fine-tuning Llama pre-trained models?
296
+
297
+ Fine-tuning requirements also vary based on amount of data, time to complete fine-tuning and cost constraints. To fine-tune these models
298
+ we have generally used multiple NVIDIA A100 machines with data parallelism across nodes and a mix of data and tensor parallelism intra
299
+ node. But using a single machine, or other GPU types are definitely possible (e.g. alpaca models are trained on a single RTX4090:
300
+ https://github.com/tloen/alpaca-lora)
301
+
302
+
303
+ What fine-tuning tasks would these models support?
304
+
305
+
306
+ The Llama 2 fine-tuned models were fine-tuned for dialogue specific uses like chat bots.
307
+
308
+
309
+ Are there examples on how one can fine-tune the models?
310
+
311
+
312
+ You can find example fine-tuning scripts in the Github recipes repository.
313
+ You can also review the fine-tuning section in our Getting started with Llama guide.
314
+
315
+
316
+ What is the difference between a pre-trained and fine-tuned model?
317
+
318
+
319
+ The Llama 2 pre-trained models were trained for general large language applications, whereas the Llama 2 chat models were fine-tuned
320
+ for dialogue specific uses like chat bots.
321
+
322
+
323
+ Are we going to publish model cards and info for each of the 4 models (7B/13B - PT and FT)?
324
+
325
+
326
+ You can find the Llama 2 model card here.
327
+
328
+
329
+ What are some effective practices/ways that seem to be more effective at fine-tuning?
330
+
331
+
332
+ You can find some best practices in the fine-tuning section in our Getting started with Llama guide.
333
+
334
+
335
+ How effective is using LoRA?
336
+
337
+
338
+ LoRA has made the fine-tuning of LLMs like Llama 2 possible on consumer GPUs (like the Tesla T4) by retraining only a very small set of
339
+ model parameters, democratizing LLM fine-tuning while still reaching performance comparable to fine-tuning the whole model on
340
+ much more expensive GPUs. So LoRA is essential and very effective in Llama 2 fine-tuning.
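A minimal sketch of attaching a LoRA adapter to a Llama 2 checkpoint, assuming the peft and transformers packages; the rank and target modules are common starting values, not recommendations from this document.

```python
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")  # assumed example checkpoint

lora_cfg = LoraConfig(
    r=8,                                   # adapter rank ('r'); see the question on choosing it below
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],   # attention projections, a common starting point
    task_type="CAUSAL_LM",
)
model = get_peft_model(base, lora_cfg)
model.print_trainable_parameters()         # typically well under 1% of all parameters
```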
341
+
342
+
343
+ How should we think about post processing (validate generated data) as a way to fine-tune models?
344
+
345
+
346
+ a. It depends on the application and what type of data we are fine-tuning on, but the evaluation needs to go beyond normal harness eval sets to something
347
+ that makes sense for the application; for example, for something like SQL data, running the generated code would be a better eval. So
348
+ essentially, having truthful data for the specific application can be helpful to reduce the risk for that application
349
+
350
+
351
+
352
+ b. Also setting some sort of threshold such as prob>90% might be helpful to get more confidence in the output
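A minimal sketch of that probability-threshold idea, assuming a transformers version that provides compute_transition_scores; applying the 90% figure to the mean token probability is one possible reading of the suggestion, and the checkpoint id and prompt are illustrative.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"               # assumed example checkpoint
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tok("SELECT name FROM users WHERE ", return_tensors="pt")
out = model.generate(
    **inputs, max_new_tokens=20, return_dict_in_generate=True, output_scores=True
)

# Per-token log-probabilities of the generated continuation.
scores = model.compute_transition_scores(out.sequences, out.scores, normalize_logits=True)
mean_prob = torch.exp(scores.mean()).item()

if mean_prob < 0.9:
    print(f"Low-confidence generation ({mean_prob:.2f}); route to extra validation.")
else:
    print(tok.decode(out.sequences[0], skip_special_tokens=True))
```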
353
+ What are the different libraries that we recommend for fine-tuning?
354
+
355
+ You can find some fine-tuning recommendations in the Llama 2 Github recipes repository as well as fine-tuning section in our Getting
356
+ started with Llama guide.
357
+
358
+
359
+ How can we identify the right ‘r’ value for the LoRA method for a certain use-case?
360
+
361
+
362
+ The best approach would be to review the LoRA research paper for more information on the rankings, then reviewing similar
363
+ implementations for other models and finally experimenting.
364
+
365
+
366
+ We hope to use prompt engineering as a lever to nudge behavior. Any pointers on enhancing instruction-
367
+ following by fine-tuning small llama models?
368
+
369
+ Take a look at the fine-tuning section in our Getting started with Llama guide for some pointers towards fine-tuning.
370
+
371
+
372
+
373
+
374
+ Prompting
375
+
376
+
377
+
378
+ Strategies to help models handle longer conversations?
379
+
380
+
381
+ You can find some helpful information towards this in the Prompting and Llama demo app.
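A minimal sketch of one common strategy for long conversations: always keep the system prompt and retain only as many recent turns as fit a fixed budget. The word-count stand-in for token counting and the budget value are simplifying assumptions; a real implementation would use the model's tokenizer.

```python
def build_history(system_prompt, turns, budget_tokens=3500):
    """turns: list of (role, text) tuples, oldest first."""
    def count(text):
        return len(text.split())                 # crude stand-in for real token counting

    kept, used = [], count(system_prompt)
    for role, text in reversed(turns):           # walk backwards from the newest turn
        cost = count(text)
        if used + cost > budget_tokens:
            break                                # older turns no longer fit the budget
        kept.append((role, text))
        used += cost
    return [("system", system_prompt)] + list(reversed(kept))

turns = [("user", "Hi"), ("assistant", "Hello!"), ("user", "Tell me about Llama 2.")]
print(build_history("You are a helpful assistant.", turns))
```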
382
+
383
+
384
+
385
+
386
+ Legal
387
+
388
+
389
+
390
+ Is Llama 2 open source? What is the exact license these models are published under?
391
+
392
+
393
+ 1. This is a bespoke commercial license that balances open access to the models with responsibility and protections in place to help
394
+ address potential misuse.
395
+ 2. Our license allows for broad commercial use, as well as for developers to create and redistribute additional work on top of Llama 2.
396
+ 3. We want to enable more innovation in both research and commercial use cases, but believe in taking a responsible approach to
397
+ releasing AI technologies.
398
+ 4. For more details, our license can be found here.
399
+
400
+
401
+ Is there any copyrighted material in any of the training datasets?
402
+
403
+
404
+ The model is trained on a subset of publicly available text-based datasets.
405
+
406
+
407
+
408
+
409
+ Benchmarking
410
+ Do we have any benchmarks for the v2 models? Are we planning to publish it? If so, do we have an ETA?
411
+
412
+
413
+ Yes we will publish benchmarks alongside the release. If there are particular benchmarks partners are interested in it may be possible to
414
+ share some under NDA earlier.
415
+
ktem_app_data/user_data/sql.db ADDED
Binary file (86 kB).
 
ktem_app_data/user_data/vectorstore/94b1c233-a69f-441c-a2b8-336ba11ee805/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a13e72541800c513c73dccea69f79e39cf4baef4fa23f7e117c0d6b0f5f99670
3
+ size 3212000
ktem_app_data/user_data/vectorstore/94b1c233-a69f-441c-a2b8-336ba11ee805/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ec6df10978b056a10062ed99efeef2702fa4a1301fad702b53dd2517103c746
3
+ size 100
ktem_app_data/user_data/vectorstore/94b1c233-a69f-441c-a2b8-336ba11ee805/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d972500314b56e992903f5fb71da1e0b5d6fd9bdf0f0079d8fa61e7412419d9
3
+ size 4000
ktem_app_data/user_data/vectorstore/94b1c233-a69f-441c-a2b8-336ba11ee805/link_lists.bin ADDED
File without changes
ktem_app_data/user_data/vectorstore/chroma.sqlite3 ADDED
Binary file (393 kB).
 
requirements.txt CHANGED
@@ -1 +1 @@
1
- kotaemon-app @ git+https://github.com/Cinnamon/kotaemon.git@root-toml
 
1
+ kotaemon-app @ git+https://github.com/Cinnamon/kotaemon.git@hfhub-deploy