|
import ssl
import time
import warnings

import feedparser
import html2text
from langchain.docstore.document import Document
|
|
|
|
|
class RSS_Url_loader:
    """Load RSS/Atom feed entries as LangChain ``Document`` objects.

    Each feed entry becomes one ``Document``: ``page_content`` is the entry
    body passed through ``html2text.html2text``, and ``metadata`` holds the
    entry's ``title`` and ``link``.
    """

    def __init__(self, urls=None, interval=60):
        """Store the feed URLs and the interval value.

        Args:
            urls: A single feed URL as a string, or a list of feed URLs.
                ``None`` (the default) leaves the loader with no URLs.
            interval: Value stored on ``self.interval``; presumably the
                number of seconds to wait between polls (nothing in this
                class sleeps on it -- see ``scheduled_execution``).

        An unsupported ``urls`` type does not raise: a ``UserWarning`` is
        emitted and the loader is left with an empty URL list.
        """
        self.urls = []
        self.interval = interval

        if urls is None:
            return

        if isinstance(urls, str):
            # A bare string is treated as one URL, not as an iterable of chars.
            self.urls = [urls]
        elif isinstance(urls, list):
            self.urls = urls
        else:
            # Best-effort construction is preserved (no exception), but the
            # problem is now reported instead of being silently dropped.
            warnings.warn('urls must be a list or a string.', stacklevel=2)

    def scheduled_execution(self):
        """Fetch all feeds once and return the resulting documents.

        NOTE(review): despite the name, this performs exactly one ``load()``
        and returns; it never sleeps or repeats, because returning the result
        to the caller ends the call. Callers that want polling should invoke
        this repeatedly and wait ``self.interval`` seconds between calls.

        Returns:
            The list produced by ``load()``.
        """
        return self.load()

    def load(self):
        """Fetch every configured feed and convert its entries to documents.

        Returns:
            A list of ``Document`` objects, one per feed entry, ordered by
            URL and then by entry position. Empty when no URLs are configured.
        """
        # SECURITY NOTE(review): this replaces the process-wide default HTTPS
        # context factory with one that skips TLS certificate verification,
        # which affects every later default-context HTTPS request in the
        # process, not only these feeds. Kept for backward compatibility;
        # confirm it is still required before relying on it.
        if hasattr(ssl, '_create_unverified_context'):
            ssl._create_default_https_context = ssl._create_unverified_context

        documents = []
        for url in self.urls:
            parsed = feedparser.parse(url)
            documents.extend(
                self._entry_to_document(entry) for entry in parsed.entries
            )
        return documents

    @staticmethod
    def _entry_to_document(entry):
        """Build one ``Document`` from a parsed feed entry.

        Missing fields fall back to an empty string so that a single
        incomplete entry cannot abort the whole load.
        """
        content = entry.get("content")
        if content:
            # Prefer the full content body when the feed provides one.
            html = content[0].get("value") or ""
        else:
            html = entry.get("description") or entry.get("summary") or ""

        metadata = {
            "title": entry.get("title", ""),
            "link": entry.get("link", ""),
        }
        return Document(page_content=html2text.html2text(html), metadata=metadata)
|
|
|
if __name__ == "__main__":
    # Manual smoke run: fetch two public feeds and print each parsed document.
    feed_urls = ["https://www.zhihu.com/rss", "https://www.36kr.com/feed"]
    for document in RSS_Url_loader(feed_urls).load():
        print(document)