Update README.md
Browse files
README.md
CHANGED
@@ -48,7 +48,7 @@ question = "When was the bellagio in las vegas built?"
|
|
48 |
html = """
|
49 |
<html>
|
50 |
<head>
|
51 |
-
<
|
52 |
</head>
|
53 |
<body>
|
54 |
<p class="class0">The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
|
@@ -76,7 +76,7 @@ simplified_html = clean_html(html)
|
|
76 |
print(simplified_html)
|
77 |
|
78 |
# <html>
|
79 |
-
# <
|
80 |
# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
|
81 |
# <div>
|
82 |
# <p>Some other text</p>
|
@@ -111,14 +111,14 @@ MAX_CONTEXT_WINDOW_GEN = 32
|
|
111 |
from htmlrag import build_block_tree
|
112 |
|
113 |
block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
|
114 |
-
# block_tree, simplified_html=build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_GEN, zh_char=True) # for Chinese text
|
115 |
for block in block_tree:
|
116 |
print("Block Content: ", block[0])
|
117 |
print("Block Path: ", block[1])
|
118 |
print("Is Leaf: ", block[2])
|
119 |
print("")
|
120 |
|
121 |
-
# Block Content: <
|
122 |
# Block Path: ['html', 'title']
|
123 |
# Is Leaf: True
|
124 |
#
|
@@ -139,7 +139,7 @@ for block in block_tree:
|
|
139 |
```python
|
140 |
from htmlrag import EmbedHTMLPruner
|
141 |
|
142 |
-
embed_model="/
|
143 |
query_instruction_for_retrieval = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
|
144 |
embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=True, query_instruction_for_retrieval = query_instruction_for_retrieval)
|
145 |
# Alternatively, you can initialize a remote TEI model; see https://github.com/huggingface/text-embeddings-inference.
|
@@ -148,15 +148,15 @@ embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=Tru
|
|
148 |
block_rankings=embed_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
|
149 |
print(block_rankings)
|
150 |
|
151 |
-
# [
|
152 |
|
153 |
# Alternatively, you can use BM25 to rank the blocks
|
154 |
from htmlrag import BM25HTMLPruner
|
155 |
bm25_html_pruner = BM25HTMLPruner()
|
156 |
-
block_rankings=bm25_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
|
157 |
print(block_rankings)
|
158 |
|
159 |
-
# [
|
160 |
|
161 |
from transformers import AutoTokenizer
|
162 |
|
@@ -166,7 +166,7 @@ pruned_html = embed_html_pruner.prune_HTML(simplified_html, block_tree, block_ra
|
|
166 |
print(pruned_html)
|
167 |
|
168 |
# <html>
|
169 |
-
# <
|
170 |
# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
|
171 |
# </html>
|
172 |
```
|
@@ -179,15 +179,15 @@ from htmlrag import GenHTMLPruner
|
|
179 |
import torch
|
180 |
|
181 |
# construct a finer block tree
|
182 |
-
block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
|
183 |
-
# block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN, zh_char=True) # for Chinese text
|
184 |
for block in block_tree:
|
185 |
print("Block Content: ", block[0])
|
186 |
print("Block Path: ", block[1])
|
187 |
print("Is Leaf: ", block[2])
|
188 |
print("")
|
189 |
|
190 |
-
# Block Content: <
|
191 |
# Block Path: ['html', 'title']
|
192 |
# Is Leaf: True
|
193 |
#
|
@@ -200,8 +200,8 @@ if torch.cuda.is_available():
|
|
200 |
device="cuda"
|
201 |
else:
|
202 |
device="cpu"
|
203 |
-
gen_embed_pruner = GenHTMLPruner(gen_model=ckpt_path,
|
204 |
-
block_rankings=gen_embed_pruner.calculate_block_rankings(question, pruned_html, block_tree)
|
205 |
print(block_rankings)
|
206 |
|
207 |
# [1, 0]
|
|
|
48 |
html = """
|
49 |
<html>
|
50 |
<head>
|
51 |
+
<h1>Bellagio Hotel in Las</h1>
|
52 |
</head>
|
53 |
<body>
|
54 |
<p class="class0">The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
|
|
|
76 |
print(simplified_html)
|
77 |
|
78 |
# <html>
|
79 |
+
# <h1>Bellagio Hotel in Las</h1>
|
80 |
# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
|
81 |
# <div>
|
82 |
# <p>Some other text</p>
|
|
|
111 |
from htmlrag import build_block_tree
|
112 |
|
113 |
block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
|
114 |
+
# block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED, zh_char=True) # for Chinese text
|
115 |
for block in block_tree:
|
116 |
print("Block Content: ", block[0])
|
117 |
print("Block Path: ", block[1])
|
118 |
print("Is Leaf: ", block[2])
|
119 |
print("")
|
120 |
|
121 |
+
# Block Content: <h1>Bellagio Hotel in Las</h1>
|
122 |
# Block Path: ['html', 'title']
|
123 |
# Is Leaf: True
|
124 |
#
|
|
|
139 |
```python
|
140 |
from htmlrag import EmbedHTMLPruner
|
141 |
|
142 |
+
embed_model="BAAI/bge-large-en"
|
143 |
query_instruction_for_retrieval = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
|
144 |
embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=True, query_instruction_for_retrieval = query_instruction_for_retrieval)
|
145 |
# Alternatively, you can initialize a remote TEI model; see https://github.com/huggingface/text-embeddings-inference.
|
|
|
148 |
block_rankings=embed_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
|
149 |
print(block_rankings)
|
150 |
|
151 |
+
# [2, 0, 1]
|
152 |
|
153 |
# Alternatively, you can use BM25 to rank the blocks
|
154 |
from htmlrag import BM25HTMLPruner
|
155 |
bm25_html_pruner = BM25HTMLPruner()
|
156 |
+
block_rankings = bm25_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
|
157 |
print(block_rankings)
|
158 |
|
159 |
+
# [2, 0, 1]
|
160 |
|
161 |
from transformers import AutoTokenizer
|
162 |
|
|
|
166 |
print(pruned_html)
|
167 |
|
168 |
# <html>
|
169 |
+
# <h1>Bellagio Hotel in Las</h1>
|
170 |
# <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
|
171 |
# </html>
|
172 |
```
|
|
|
179 |
import torch
|
180 |
|
181 |
# construct a finer block tree
|
182 |
+
block_tree, pruned_html = build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
|
183 |
+
# block_tree, pruned_html = build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN, zh_char=True) # for Chinese text
|
184 |
for block in block_tree:
|
185 |
print("Block Content: ", block[0])
|
186 |
print("Block Path: ", block[1])
|
187 |
print("Is Leaf: ", block[2])
|
188 |
print("")
|
189 |
|
190 |
+
# Block Content: <h1>Bellagio Hotel in Las</h1>
|
191 |
# Block Path: ['html', 'title']
|
192 |
# Is Leaf: True
|
193 |
#
|
|
|
200 |
device="cuda"
|
201 |
else:
|
202 |
device="cpu"
|
203 |
+
gen_embed_pruner = GenHTMLPruner(gen_model=ckpt_path, device=device)
|
204 |
+
block_rankings = gen_embed_pruner.calculate_block_rankings(question, pruned_html, block_tree)
|
205 |
print(block_rankings)
|
206 |
|
207 |
# [1, 0]
|