peter-v vincentclaes commited on
Commit
aa936e5
0 Parent(s):

Duplicate from drift-ai/faq-website

Browse files

Co-authored-by: claes <vincentclaes@users.noreply.huggingface.co>

Files changed (5) hide show
  1. .gitignore +129 -0
  2. README.md +15 -0
  3. app.py +149 -0
  4. requirements.txt +10 -0
  5. scrape_website.py +66 -0
.gitignore ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: FAQ a Website
3
+ emoji: 🦙
4
+ colorFrom: white
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 3.21.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: drift-ai/faq-website
12
+ ---
13
+
14
+ # Faq A website
15
+ repo for the code to QA content form a website
app.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from peft import PeftModel
3
+ import transformers
4
+ import gradio as gr
5
+ from scrape_website import process_webpage
6
+ assert (
7
+ "LlamaTokenizer" in transformers._import_structure["models.llama"]
8
+ ), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
9
+ from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
10
+
11
+ tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
12
+
13
+ BASE_MODEL = "decapoda-research/llama-7b-hf"
14
+ LORA_WEIGHTS = "tloen/alpaca-lora-7b"
15
+
16
+ if torch.cuda.is_available():
17
+ device = "cuda"
18
+ else:
19
+ device = "cpu"
20
+
21
+ try:
22
+ if torch.backends.mps.is_available():
23
+ device = "mps"
24
+ except:
25
+ pass
26
+
27
+ if device == "cuda":
28
+ model = LlamaForCausalLM.from_pretrained(
29
+ BASE_MODEL,
30
+ load_in_8bit=False,
31
+ torch_dtype=torch.float16,
32
+ device_map="auto",
33
+ )
34
+ model = PeftModel.from_pretrained(
35
+ model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
36
+ )
37
+ elif device == "mps":
38
+ model = LlamaForCausalLM.from_pretrained(
39
+ BASE_MODEL,
40
+ device_map={"": device},
41
+ torch_dtype=torch.float16,
42
+ )
43
+ model = PeftModel.from_pretrained(
44
+ model,
45
+ LORA_WEIGHTS,
46
+ device_map={"": device},
47
+ torch_dtype=torch.float16,
48
+ )
49
+ else:
50
+ model = LlamaForCausalLM.from_pretrained(
51
+ BASE_MODEL, device_map={"": device}, low_cpu_mem_usage=True
52
+ )
53
+ model = PeftModel.from_pretrained(
54
+ model,
55
+ LORA_WEIGHTS,
56
+ device_map={"": device},
57
+ )
58
+
59
+
60
+ def generate_prompt(instruction, input=None):
61
+ if input:
62
+ return f"""Below is an url that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
63
+ ### Instruction:
64
+ {instruction}
65
+ ### Input:
66
+ {input}
67
+ ### Response:"""
68
+ else:
69
+ return f"""Below is an url that describes a task. Write a response that appropriately completes the request.
70
+ ### Instruction:
71
+ {instruction}
72
+ ### Response:"""
73
+
74
+ if device != "cpu":
75
+ model.half()
76
+ model.eval()
77
+ if torch.__version__ >= "2":
78
+ model = torch.compile(model)
79
+
80
+
81
+ def evaluate(
82
+ instruction,
83
+ url,
84
+ temperature=0.1,
85
+ top_p=0.75,
86
+ top_k=40,
87
+ num_beams=4,
88
+ max_new_tokens=128,
89
+ **kwargs,
90
+ ):
91
+ content = process_webpage(url=url)
92
+ # avoid GPU memory overflow
93
+ with torch.no_grad():
94
+ torch.cuda.empty_cache()
95
+ prompt = generate_prompt(instruction, content)
96
+ inputs = tokenizer(prompt, return_tensors="pt")
97
+ input_ids = inputs["input_ids"].to(device)
98
+ generation_config = GenerationConfig(
99
+ temperature=temperature,
100
+ top_p=top_p,
101
+ top_k=top_k,
102
+ num_beams=num_beams,
103
+ **kwargs,
104
+ )
105
+ generation_output = model.generate(
106
+ input_ids=input_ids,
107
+ generation_config=generation_config,
108
+ return_dict_in_generate=True,
109
+ output_scores=True,
110
+ max_new_tokens=max_new_tokens,
111
+ )
112
+ s = generation_output.sequences[0]
113
+ output = tokenizer.decode(s)
114
+ # avoid GPU memory overflow
115
+ torch.cuda.empty_cache()
116
+ return output.split("### Response:")[1].strip()
117
+
118
+
119
+ g = gr.Interface(
120
+ fn=evaluate,
121
+ inputs=[
122
+ gr.components.Textbox(
123
+ lines=2, label="FAQ", placeholder="Ask me anything about this website?"
124
+ ),
125
+ gr.components.Textbox(lines=1, label="Website URL", placeholder="https://www.meet-drift.ai/"),
126
+ # gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
127
+ # gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
128
+ # gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
129
+ # gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
130
+ # gr.components.Slider(
131
+ # minimum=1, maximum=512, step=1, value=128, label="Max tokens"
132
+ # ),
133
+ ],
134
+ outputs=[
135
+ gr.inputs.Textbox(
136
+ lines=5,
137
+ label="Output",
138
+ )
139
+ ],
140
+ title="FAQ A Website",
141
+ examples=[
142
+ ["Can you list the capabilities this company has in bullet points?", "https://www.meet-drift.ai/"],
143
+ ["What's the name of the founder?", "https://www.meet-drift.ai/about"],
144
+ ["in 1 word what's the service the company is providing?", "https://www.meet-drift.ai/"],
145
+ ]
146
+ # description="Alpaca-LoRA is a 7B-parameter LLaMA model finetuned to follow instructions. It is trained on the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) dataset and makes use of the Huggingface LLaMA implementation. For more information, please visit [the project's website](https://github.com/tloen/alpaca-lora).",
147
+ )
148
+ g.queue(concurrency_count=1)
149
+ g.launch()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ beautifulsoup4
2
+ requests
3
+ datasets
4
+ loralib
5
+ sentencepiece
6
+ git+https://github.com/huggingface/transformers.git
7
+ accelerate
8
+ bitsandbytes
9
+ git+https://github.com/huggingface/peft.git
10
+ gradio
scrape_website.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+
4
+ TOKEN_CUT_OFF = 2500
5
+
6
+ def process_webpage(url:str):
7
+ # A set to keep track of visited pages
8
+ visited_pages = set()
9
+
10
+ text_list = []
11
+
12
+ # A function to recursively get all child pages
13
+ def get_child_pages(url):
14
+ # Make a GET request to the page and get the HTML content
15
+ response = requests.get(url)
16
+ html_content = response.content
17
+
18
+ # Parse the HTML content using BeautifulSoup
19
+ soup = BeautifulSoup(html_content, "html.parser")
20
+
21
+ # Get all the text content from the relevant HTML tags
22
+ text_content = ""
23
+ for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]:
24
+ for element in soup.find_all(tag):
25
+ text_content += element.get_text() + " "
26
+
27
+ # Add the page to the set of visited pages
28
+ text_content = f"page {url} contains: " + text_content
29
+ visited_pages.add(url)
30
+
31
+ # Find all the child links and recursively get their text content
32
+ for link in soup.find_all("a"):
33
+ href = link.get("href")
34
+ if href and href not in visited_pages and url in href:
35
+ get_child_pages(href)
36
+
37
+ text_list.append(text_content)
38
+
39
+ # Get the text content of the landing page
40
+ # get_child_pages(url)
41
+
42
+ # Make a GET request to the page and get the HTML content
43
+ response = requests.get(url)
44
+ html_content = response.content
45
+
46
+ # Parse the HTML content using BeautifulSoup
47
+ soup = BeautifulSoup(html_content, "html.parser")
48
+
49
+ # Get all the text content from the relevant HTML tags
50
+ text_content = ""
51
+ for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]:
52
+ for element in soup.find_all(tag):
53
+ text_content += element.get_text() + " "
54
+
55
+ # # make main page as first item
56
+ # text_list.reverse()
57
+ # text_list_cut_off = text_list[:TOKEN_CUT_OFF]
58
+ # page_content = "\n".join(text_list_cut_off)
59
+ # # Print the text content of the landing page and all child pages
60
+ # print(page_content)
61
+ # return page_content
62
+ print(text_content)
63
+ return text_content
64
+
65
+ if __name__ == '__main__':
66
+ process_webpage(url="https://www.meet-drift.ai/")