Spaces:
Running
Running
Update helper_functions_api.py
Browse files
- helper_functions_api.py +16 -8
helper_functions_api.py
CHANGED
@@ -67,6 +67,7 @@ from half_json.core import JSONFixer
|
|
67 |
from openai import OpenAI
|
68 |
from together import Together
|
69 |
from urllib.parse import urlparse
|
|
|
70 |
|
71 |
llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
|
72 |
llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
|
@@ -197,13 +198,20 @@ class Scraper:
|
|
197 |
return None
|
198 |
|
199 |
def extract_main_content(html):
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
|
208 |
def process_content(data_format, url, query):
|
209 |
scraper = Scraper()
|
@@ -213,7 +221,7 @@ def process_content(data_format, url, query):
|
|
213 |
if content:
|
214 |
rephrased_content = rephrase_content(
|
215 |
data_format=data_format,
|
216 |
-
content=limit_tokens(remove_stopwords(content), token_limit=… [removed line is truncated in this capture; the old token_limit value is not shown]
|
217 |
query=query,
|
218 |
)
|
219 |
return rephrased_content, url
|
|
|
67 |
from openai import OpenAI
|
68 |
from together import Together
|
69 |
from urllib.parse import urlparse
|
70 |
+
import trafilatura
|
71 |
|
72 |
llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
|
73 |
llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
|
|
|
198 |
return None
|
199 |
|
200 |
def extract_main_content(html):
|
201 |
+
extracted = trafilatura.extract(
|
202 |
+
html,
|
203 |
+
output_format="markdown",
|
204 |
+
target_language="en",
|
205 |
+
include_tables=True,
|
206 |
+
include_images=False,
|
207 |
+
include_links=False,
|
208 |
+
deduplicate=True,
|
209 |
+
)
|
210 |
+
|
211 |
+
if extracted:
|
212 |
+
return trafilatura.utils.sanitize(extracted)
|
213 |
+
else:
|
214 |
+
return ""
|
215 |
|
216 |
def process_content(data_format, url, query):
|
217 |
scraper = Scraper()
|
|
|
221 |
if content:
|
222 |
rephrased_content = rephrase_content(
|
223 |
data_format=data_format,
|
224 |
+
content=limit_tokens(remove_stopwords(content), token_limit=4000),
|
225 |
query=query,
|
226 |
)
|
227 |
return rephrased_content, url
|