rdose committed on
Commit
b98d20d
1 Parent(s): 96edcb7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -1
app.py CHANGED
@@ -18,7 +18,10 @@ except ImportError:
18
  except ImportError:
19
  try:
20
  import trafilatura
 
21
  EXTRACTOR_NET = 'trafilatura'
 
 
22
  except ImportError:
23
  raise ImportError
24
 
@@ -301,7 +304,7 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
301
  extracted = extract_content(requests.get(url).content)
302
  input_batch_content.append(extracted)
303
  elif(EXTRACTOR_NET == 'trafilatura'):
304
- extracted = trafilatura.extract(trafilatura.fetch_url(url), include_comments=False)
305
  input_batch_content.append(extracted)
306
  else:
307
  print("[i] Data is news contents")
 
18
  except ImportError:
19
  try:
20
  import trafilatura
21
+ from trafilatura.settings import use_config
22
  EXTRACTOR_NET = 'trafilatura'
23
+ trafilatura_config = use_config()
24
+ trafilatura_config.set("DEFAULT", "EXTRACTION_TIMEOUT", "0") # Disable the extraction timeout so trafilatura does not use signals, which clash with gradio threads
25
  except ImportError:
26
  raise ImportError
27
 
 
304
  extracted = extract_content(requests.get(url).content)
305
  input_batch_content.append(extracted)
306
  elif(EXTRACTOR_NET == 'trafilatura'):
307
+ extracted = trafilatura.extract(trafilatura.fetch_url(url), include_comments=False, config=trafilatura_config, include_tables=False)
308
  input_batch_content.append(extracted)
309
  else:
310
  print("[i] Data is news contents")