Spaces:

NeuML
/

txtai

Running

txtai / textractor.py

Update textractor.py

daa1a9a over 2 years ago

No virus

589 Bytes

	"""
	Textractor module
	"""

	import requests

	from bs4 import BeautifulSoup

	from txtai.pipeline.segmentation import Segmentation

	class Textractor(Segmentation):
	    """
	    Extracts text from web pages.

	    Overrides the text hook of Segmentation so that the incoming value is treated as a url:
	    the page is downloaded and its HTML markup is stripped, leaving only the text content.
	    """

	    def __init__(self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False, timeout=30):
	        """
	        Creates a new Textractor.

	        Args:
	            sentences: forwarded unchanged to Segmentation
	            lines: forwarded unchanged to Segmentation
	            paragraphs: forwarded unchanged to Segmentation
	            minlength: forwarded unchanged to Segmentation
	            join: forwarded unchanged to Segmentation
	            timeout: seconds to wait on the remote server before giving up, None waits forever
	        """

	        super().__init__(sentences, lines, paragraphs, minlength, join)

	        # requests applies no timeout unless one is passed explicitly
	        self.timeout = timeout

	    def text(self, text):
	        """
	        Downloads the page at a url and returns its text content.

	        The HTTP status code is not checked, the body of an error page is parsed like any
	        other response.

	        Args:
	            text: url of the page to download

	        Returns:
	            text of the page with HTML markup removed

	        Raises:
	            requests.exceptions.RequestException: if the request fails or times out
	        """

	        # text is a url
	        # Without a timeout, an unresponsive server would block this call indefinitely
	        response = requests.get(text, timeout=self.timeout)
	        html = response.text

	        soup = BeautifulSoup(html, features="html.parser")
	        return soup.get_text()