zstanjj committed on
Commit
e5d0cb5
·
verified ·
1 Parent(s): 9c01983

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +14 -14
README.md CHANGED
@@ -48,7 +48,7 @@ question = "When was the bellagio in las vegas built?"
48
  html = """
49
  <html>
50
  <head>
51
- <title>When was the bellagio in las vegas built?</title>
52
  </head>
53
  <body>
54
  <p class="class0">The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
@@ -76,7 +76,7 @@ simplified_html = clean_html(html)
76
  print(simplified_html)
77
 
78
  # <html>
79
- # <title>When was the bellagio in las vegas built?</title>
80
  # <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
81
  # <div>
82
  # <p>Some other text</p>
@@ -111,14 +111,14 @@ MAX_CONTEXT_WINDOW_GEN = 32
111
  from htmlrag import build_block_tree
112
 
113
  block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
114
- # block_tree, simplified_html=build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_GEN, zh_char=True) # for Chinese text
115
  for block in block_tree:
116
  print("Block Content: ", block[0])
117
  print("Block Path: ", block[1])
118
  print("Is Leaf: ", block[2])
119
  print("")
120
 
121
- # Block Content: <title>When was the bellagio in las vegas built?</title>
122
  # Block Path: ['html', 'title']
123
  # Is Leaf: True
124
  #
@@ -139,7 +139,7 @@ for block in block_tree:
139
  ```python
140
  from htmlrag import EmbedHTMLPruner
141
 
142
- embed_model="/train_data_load/huggingface/tjj_hf/bge-large-en/"
143
  query_instruction_for_retrieval = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
144
  embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=True, query_instruction_for_retrieval = query_instruction_for_retrieval)
145
  # alternatively you can init a remote TEI model, refer to https://github.com/huggingface/text-embeddings-inference.
@@ -148,15 +148,15 @@ embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=Tru
148
  block_rankings=embed_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
149
  print(block_rankings)
150
 
151
- # [0, 2, 1]
152
 
153
  #. alternatively you can use bm25 to rank the blocks
154
  from htmlrag import BM25HTMLPruner
155
  bm25_html_pruner = BM25HTMLPruner()
156
- block_rankings=bm25_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
157
  print(block_rankings)
158
 
159
- # [0, 2, 1]
160
 
161
  from transformers import AutoTokenizer
162
 
@@ -166,7 +166,7 @@ pruned_html = embed_html_pruner.prune_HTML(simplified_html, block_tree, block_ra
166
  print(pruned_html)
167
 
168
  # <html>
169
- # <title>When was the bellagio in las vegas built?</title>
170
  # <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
171
  # </html>
172
  ```
@@ -179,15 +179,15 @@ from htmlrag import GenHTMLPruner
179
  import torch
180
 
181
  # construct a finer block tree
182
- block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
183
- # block_tree, pruned_html=build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN, zh_char=True) # for Chinese text
184
  for block in block_tree:
185
  print("Block Content: ", block[0])
186
  print("Block Path: ", block[1])
187
  print("Is Leaf: ", block[2])
188
  print("")
189
 
190
- # Block Content: <title>When was the bellagio in las vegas built?</title>
191
  # Block Path: ['html', 'title']
192
  # Is Leaf: True
193
  #
@@ -200,8 +200,8 @@ if torch.cuda.is_available():
200
  device="cuda"
201
  else:
202
  device="cpu"
203
- gen_embed_pruner = GenHTMLPruner(gen_model=ckpt_path, max_node_words=MAX_NODE_WORDS_GEN, device=device)
204
- block_rankings=gen_embed_pruner.calculate_block_rankings(question, pruned_html, block_tree)
205
  print(block_rankings)
206
 
207
  # [1, 0]
 
48
  html = """
49
  <html>
50
  <head>
51
+ <h1>Bellagio Hotel in Las</h1>
52
  </head>
53
  <body>
54
  <p class="class0">The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
 
76
  print(simplified_html)
77
 
78
  # <html>
79
+ # <h1>Bellagio Hotel in Las</h1>
80
  # <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
81
  # <div>
82
  # <p>Some other text</p>
 
111
  from htmlrag import build_block_tree
112
 
113
  block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_EMBED)
114
+ # block_tree, simplified_html = build_block_tree(simplified_html, max_node_words=MAX_NODE_WORDS_GEN, zh_char=True) # for Chinese text
115
  for block in block_tree:
116
  print("Block Content: ", block[0])
117
  print("Block Path: ", block[1])
118
  print("Is Leaf: ", block[2])
119
  print("")
120
 
121
+ # Block Content: <h1>Bellagio Hotel in Las</h1>
122
  # Block Path: ['html', 'title']
123
  # Is Leaf: True
124
  #
 
139
  ```python
140
  from htmlrag import EmbedHTMLPruner
141
 
142
+ embed_model="BAAI/bge-large-en"
143
  query_instruction_for_retrieval = "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
144
  embed_html_pruner = EmbedHTMLPruner(embed_model=embed_model, local_inference=True, query_instruction_for_retrieval = query_instruction_for_retrieval)
145
  # alternatively you can init a remote TEI model, refer to https://github.com/huggingface/text-embeddings-inference.
 
148
  block_rankings=embed_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
149
  print(block_rankings)
150
 
151
+ # [2, 0, 1]
152
 
153
  #. alternatively you can use bm25 to rank the blocks
154
  from htmlrag import BM25HTMLPruner
155
  bm25_html_pruner = BM25HTMLPruner()
156
+ block_rankings = bm25_html_pruner.calculate_block_rankings(question, simplified_html, block_tree)
157
  print(block_rankings)
158
 
159
+ # [2, 0, 1]
160
 
161
  from transformers import AutoTokenizer
162
 
 
166
  print(pruned_html)
167
 
168
  # <html>
169
+ # <h1>Bellagio Hotel in Las</h1>
170
  # <p>The Bellagio is a luxury hotel and casino located on the Las Vegas Strip in Paradise, Nevada. It was built in 1998.</p>
171
  # </html>
172
  ```
 
179
  import torch
180
 
181
  # construct a finer block tree
182
+ block_tree, pruned_html = build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN)
183
+ # block_tree, pruned_html = build_block_tree(pruned_html, max_node_words=MAX_NODE_WORDS_GEN, zh_char=True) # for Chinese text
184
  for block in block_tree:
185
  print("Block Content: ", block[0])
186
  print("Block Path: ", block[1])
187
  print("Is Leaf: ", block[2])
188
  print("")
189
 
190
+ # Block Content: <h1>Bellagio Hotel in Las</h1>
191
  # Block Path: ['html', 'title']
192
  # Is Leaf: True
193
  #
 
200
  device="cuda"
201
  else:
202
  device="cpu"
203
+ gen_embed_pruner = GenHTMLPruner(gen_model=ckpt_path, device=device)
204
+ block_rankings = gen_embed_pruner.calculate_block_rankings(question, pruned_html, block_tree)
205
  print(block_rankings)
206
 
207
  # [1, 0]