Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -306,7 +306,16 @@ def inference(input_batch,isurl,use_archive,filt_companies_topic,limit_companies
|
|
306 |
extracted = extract_content(requests.get(url).content)
|
307 |
input_batch_content.append(extracted)
|
308 |
elif(EXTRACTOR_NET == 'trafilatura'):
|
309 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
310 |
input_batch_content.append(extracted)
|
311 |
else:
|
312 |
print("[i] Data is news contents")
|
@@ -349,7 +358,7 @@ def inference(input_batch,isurl,use_archive,filt_companies_topic,limit_companies
|
|
349 |
if ner_labels[idx]: #not empty
|
350 |
for ner in ner_labels[idx]:
|
351 |
if filt_companies_topic:
|
352 |
-
if news_sectors[idx]
|
353 |
continue
|
354 |
dfo = pd.concat( [dfo, df.loc[[idx]].assign(company=ner[0], sector=ner[1], symbol=ner[2])], join='outer', ignore_index=True) #axis=0
|
355 |
print("[i] Pandas output shape:",dfo.shape)
|
|
|
306 |
extracted = extract_content(requests.get(url).content)
|
307 |
input_batch_content.append(extracted)
|
308 |
elif(EXTRACTOR_NET == 'trafilatura'):
|
309 |
+
try:
|
310 |
+
extracted = trafilatura.extract(trafilatura.fetch_url(url), include_comments=False, config=trafilatura_config, include_tables=False)
|
311 |
+
except:
|
312 |
+
archive = is_in_archive(url)
|
313 |
+
if archive['archived']:
|
314 |
+
print("[W] Using archive.org version of",url)
|
315 |
+
url = archive['url']
|
316 |
+
extracted = trafilatura.extract(trafilatura.fetch_url(url), include_comments=False, config=trafilatura_config, include_tables=False)
|
317 |
+
else:
|
318 |
+
print("[E] URL=",url,"not found")
|
319 |
input_batch_content.append(extracted)
|
320 |
else:
|
321 |
print("[i] Data is news contents")
|
|
|
358 |
if ner_labels[idx]: #not empty
|
359 |
for ner in ner_labels[idx]:
|
360 |
if filt_companies_topic:
|
361 |
+
if news_sectors[idx][0] not in ner[1]:
|
362 |
continue
|
363 |
dfo = pd.concat( [dfo, df.loc[[idx]].assign(company=ner[0], sector=ner[1], symbol=ner[2])], join='outer', ignore_index=True) #axis=0
|
364 |
print("[i] Pandas output shape:",dfo.shape)
|