timep12345 committed
Commit 101dfab · 1 Parent(s): 0b8eb3e

Update app.py

Files changed (1)
  1. app.py +6 -1
app.py CHANGED
@@ -8,8 +8,10 @@ from langchain.llms import HuggingFaceHub
 from langchain.embeddings import HuggingFaceHubEmbeddings
 from langchain.vectorstores import Chroma
 from langchain.chains import RetrievalQA
+
 from trafilatura import fetch_url, extract
 from trafilatura.spider import focused_crawler
+from trafilatura.settings import use_config



@@ -20,11 +22,14 @@ def url_changes(url, pages_to_visit, urls_to_scrape, repo_id):
     to_visit, links = focused_crawler(url, max_seen_urls=pages_to_visit, max_known_urls=urls_to_scrape)
     print(f"{len(links)} to be crawled")

+    config = use_config()
+    config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")
+
     results_df = pd.DataFrame()
     for url in links:
         downloaded = fetch_url(url)
         if downloaded:
-            result = extract(downloaded, output_format='json')
+            result = extract(downloaded, output_format='json', config=config)
             result = json.loads(result)

             results_df = pd.concat([results_df, pd.DataFrame.from_records([result])])
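
For context (this note is not part of the commit): the added config disables trafilatura's extraction timeout. That timeout is implemented with signal handling, which only works on the main thread, so setting EXTRACTION_TIMEOUT to "0" is the usual workaround when extract() runs in a worker thread, as in a Gradio/Spaces app. A minimal standalone sketch of the configured call follows; the URL is hypothetical:

import json

from trafilatura import fetch_url, extract
from trafilatura.settings import use_config

# Load trafilatura's default configuration and disable the signal-based
# extraction timeout so extract() can run outside the main thread.
config = use_config()
config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0")

downloaded = fetch_url("https://example.com")  # hypothetical URL
if downloaded:
    # extract() returns a JSON string here, or None if extraction fails
    result = extract(downloaded, output_format='json', config=config)
    if result:
        record = json.loads(result)
        print(record.get("title"))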