import datetime
import os
import re
from urllib.parse import unquote, urlparse

import add_qwen_libs  # NOQA
import jsonlines

from qwen_agent.log import logger
from qwen_agent.utils.doc_parser import parse_doc, parse_html_bs
from qwen_agent.utils.utils import print_traceback, save_text_to_file
from qwen_server.schema import Record


def is_local_path(path):
    """Return True if `path` is a local filesystem path rather than an http(s) URL."""
    return not (path.startswith('https://') or path.startswith('http://'))
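
# Illustrative behaviour (the paths below are placeholders, not real files):
#   is_local_path('https://example.com/a.pdf') -> False
#   is_local_path('/home/user/a.pdf')          -> True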


def sanitize_chrome_file_path(file_path: str) -> str:
    """Normalize a file path reported by Chrome so that it resolves on this host."""
    # For Linux and macOS.
    if os.path.exists(file_path):
        return file_path

    # For native Windows, drop the leading '/' in '/C:/'.
    win_path = file_path
    if win_path.startswith('/'):
        win_path = win_path[1:]
    if os.path.exists(win_path):
        return win_path

    # For Windows + WSL, map 'C:/...' to '/mnt/c/...'.
    if re.match(r'^[A-Za-z]:/', win_path):
        wsl_path = f'/mnt/{win_path[0].lower()}/{win_path[3:]}'
        if os.path.exists(wsl_path):
            return wsl_path

    # For native Windows, replace / with \.
    win_path = win_path.replace('/', '\\')
    if os.path.exists(win_path):
        return win_path

    return file_path
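
# Illustrative behaviour for '/C:/Users/me/report.pdf' (a placeholder path);
# which branch fires depends on the host filesystem:
#   under WSL it may resolve to '/mnt/c/Users/me/report.pdf';
#   on native Windows it may resolve to 'C:/Users/me/report.pdf'.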


def extract_and_cache_document(data, cache_file, cache_root):
    """Parse a web page or document and append the result to the jsonlines cache."""
    logger.info('Starting to cache pages...')
    if data['url'].split('.')[-1].lower() in ['pdf', 'docx', 'pptx']:
        date1 = datetime.datetime.now()
        # Append a placeholder record marking this URL as still being
        # processed (checked=False).
        new_record = Record(url=data['url'],
                            time='',
                            type=data['type'],
                            raw=[],
                            extract='',
                            topic='',
                            checked=False,
                            session=[]).to_dict()
        with jsonlines.open(cache_file, mode='a') as writer:
            writer.write(new_record)
        if not is_local_path(data['url']):
            pdf_path = data['url']
        else:
            parsed_url = urlparse(data['url'])
            pdf_path = unquote(parsed_url.path)
            pdf_path = sanitize_chrome_file_path(pdf_path)
        try:
            pdf_content = parse_doc(pdf_path)
        except Exception:
            print_traceback()
            # Parsing failed: remove the placeholder record for this URL.
            lines = []
            if os.path.exists(cache_file):
                for line in jsonlines.open(cache_file):
                    if line['url'] != data['url']:
                        lines.append(line)
            with jsonlines.open(cache_file, mode='w') as writer:
                for new_line in lines:
                    writer.write(new_line)
            return 'failed'
        date2 = datetime.datetime.now()
        logger.info('Parsing pdf time: ' + str(date2 - date1))
        data['content'] = pdf_content
        data['type'] = 'pdf'
        # Use the bare file name (no directory, no extension) as the extract.
        extract = pdf_path.split('/')[-1].split('\\')[-1].split('.')[0]
    elif data['content'] and data['type'] == 'html':
        # Append a placeholder record for the HTML page as well.
        new_record = Record(url=data['url'],
                            time='',
                            type=data['type'],
                            raw=[],
                            extract='',
                            topic='',
                            checked=False,
                            session=[]).to_dict()
        with jsonlines.open(cache_file, mode='a') as writer:
            writer.write(new_record)
        try:
            tmp_html_file = os.path.join(cache_root, 'tmp.html')
            save_text_to_file(tmp_html_file, data['content'])
            data['content'] = parse_html_bs(tmp_html_file)
            extract = data['content'][0]['metadata']['title']
        except Exception:
            print_traceback()
            # Parsing failed: remove the placeholder record, as in the PDF branch.
            lines = []
            if os.path.exists(cache_file):
                for line in jsonlines.open(cache_file):
                    if line['url'] != data['url']:
                        lines.append(line)
            with jsonlines.open(cache_file, mode='w') as writer:
                for new_line in lines:
                    writer.write(new_line)
            return 'failed'
    else:
        logger.error(
            "Only support the following file types: ['.html', '.pdf', '.docx', '.pptx']"
        )
        raise NotImplementedError

    today = datetime.date.today()
    new_record = Record(url=data['url'],
                        time=str(today),
                        type=data['type'],
                        raw=data['content'],
                        extract=extract,
                        topic='',
                        checked=True,
                        session=[])
    # Replace any existing record for this URL with the finished one.
    lines = []
    if os.path.exists(cache_file):
        for line in jsonlines.open(cache_file):
            if line['url'] != data['url']:
                lines.append(line)
    lines.append(new_record.to_dict())  # cache
    with jsonlines.open(cache_file, mode='w') as writer:
        for new_line in lines:
            writer.write(new_line)
    return 'Cached'
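

if __name__ == '__main__':
    # Minimal smoke test: the paths below are placeholder values, not files
    # shipped with this module; point them at a real local PDF and a writable
    # cache directory before running.
    demo_data = {'url': '/tmp/example.pdf', 'type': 'pdf', 'content': ''}
    print(extract_and_cache_document(demo_data, '/tmp/browse_cache.jsonl', '/tmp'))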