Spaces:
Runtime error
Runtime error
Commit
·
101dfab
1
Parent(s):
0b8eb3e
Update app.py
Browse files
app.py
CHANGED
@@ -8,8 +8,10 @@ from langchain.llms import HuggingFaceHub
|
|
8 |
from langchain.embeddings import HuggingFaceHubEmbeddings
|
9 |
from langchain.vectorstores import Chroma
|
10 |
from langchain.chains import RetrievalQA
|
|
|
11 |
from trafilatura import fetch_url, extract
|
12 |
from trafilatura.spider import focused_crawler
|
|
|
13 |
|
14 |
|
15 |
|
@@ -20,11 +22,14 @@ def url_changes(url, pages_to_visit, urls_to_scrape, repo_id):
|
|
20 |
to_visit, links = focused_crawler(url, max_seen_urls=pages_to_visit, max_known_urls=urls_to_scrape)
|
21 |
print(f"{len(links)} to be crawled")
|
22 |
|
|
|
|
|
|
|
23 |
results_df = pd.DataFrame()
|
24 |
for url in links:
|
25 |
downloaded = fetch_url(url)
|
26 |
if downloaded:
|
27 |
-
result = extract(downloaded, output_format='json')
|
28 |
result = json.loads(result)
|
29 |
|
30 |
results_df = pd.concat([results_df, pd.DataFrame.from_records([result])])
|
|
|
8 |
from langchain.embeddings import HuggingFaceHubEmbeddings
|
9 |
from langchain.vectorstores import Chroma
|
10 |
from langchain.chains import RetrievalQA
|
11 |
+
|
12 |
from trafilatura import fetch_url, extract
|
13 |
from trafilatura.spider import focused_crawler
|
14 |
+
from trafilatura.settings import use_config
|
15 |
|
16 |
|
17 |
|
|
|
22 |
to_visit, links = focused_crawler(url, max_seen_urls=pages_to_visit, max_known_urls=urls_to_scrape)
|
23 |
print(f"{len(links)} to be crawled")
|
24 |
|
25 |
+
config = use_config()
|
26 |
+
config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
|
27 |
+
|
28 |
results_df = pd.DataFrame()
|
29 |
for url in links:
|
30 |
downloaded = fetch_url(url)
|
31 |
if downloaded:
|
32 |
+
result = extract(downloaded, output_format='json', config=config)
|
33 |
result = json.loads(result)
|
34 |
|
35 |
results_df = pd.concat([results_df, pd.DataFrame.from_records([result])])
|