import datetime
import os
import re
from urllib.parse import unquote, urlparse
import add_qwen_libs # NOQA
import jsonlines
from qwen_agent.log import logger
from qwen_agent.utils.doc_parser import parse_doc, parse_html_bs
from qwen_agent.utils.utils import print_traceback, save_text_to_file
from qwen_server.schema import Record
def is_local_path(path):
    """Return True unless *path* is an http(s) URL.

    Anything that does not carry an explicit 'http://' or 'https://'
    scheme (including other schemes such as 'file://' or 'ftp://')
    is treated as local, matching the original behavior.
    """
    return not path.startswith(('https://', 'http://'))
def sanitize_chrome_file_path(file_path: str) -> str:
    """Map a Chrome-reported file path to one that exists on this host.

    Chrome may hand back POSIX-style paths such as '/C:/docs/a.pdf' even on
    Windows or WSL. Probe, in order: the path as-is (Linux/macOS), the path
    with the leading '/' dropped (native Windows), a '/mnt/<drive>/...'
    translation (Windows + WSL), and finally a backslash variant (native
    Windows). The first candidate that exists wins; otherwise the input is
    returned unchanged.
    """
    stripped = file_path[1:] if file_path.startswith('/') else file_path

    candidates = [file_path, stripped]
    # 'C:/...' -> '/mnt/c/...' for WSL mounts; [3:] skips 'C:/'.
    if re.match(r'^[A-Za-z]:/', stripped):
        candidates.append(f'/mnt/{stripped[0].lower()}/{stripped[3:]}')
    candidates.append(stripped.replace('/', '\\'))

    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return file_path
def extract_and_cache_document(data, cache_file, cache_root):
    """Parse the document referenced by ``data`` and cache the result.

    Workflow: append a placeholder "processing" record to the jsonlines
    cache, parse the document (PDF/DOCX/PPTX via ``parse_doc``, or HTML via
    ``parse_html_bs``), then rewrite the cache file with the final record
    (any earlier record for the same URL is dropped first).

    Args:
        data: dict with at least 'url', 'type', and 'content' keys
            (schema inferred from usage here — confirm against callers).
        cache_file: path to the jsonlines cache of ``Record`` dicts.
        cache_root: directory used for temporary files (HTML staging).

    Returns:
        'Cached' on success, 'failed' if PDF-family parsing raised.

    Raises:
        NotImplementedError: if the input is neither a PDF-family URL nor
            HTML content.
    """
    logger.info('Starting cache pages...')
    # NOTE(review): extension sniffing on the raw URL — a query string such
    # as '?name=a.pdf' would also match this branch; verify acceptable.
    if data['url'].split('.')[-1].lower() in ['pdf', 'docx', 'pptx']:
        date1 = datetime.datetime.now()
        # generate one processing record
        # Placeholder row (checked=False) so the UI can show "in progress".
        new_record = Record(url=data['url'],
                            time='',
                            type=data['type'],
                            raw=[],
                            extract='',
                            topic='',
                            checked=False,
                            session=[]).to_dict()
        with jsonlines.open(cache_file, mode='a') as writer:
            writer.write(new_record)
        # Remote documents are handed to the parser as-is; local ones are
        # URL-decoded and normalized for the current OS first.
        if data['url'].startswith('https://') or data['url'].startswith(
                'http://'):
            pdf_path = data['url']
        else:
            parsed_url = urlparse(data['url'])
            pdf_path = unquote(parsed_url.path)
            pdf_path = sanitize_chrome_file_path(pdf_path)
        try:
            pdf_content = parse_doc(pdf_path)
        except Exception:
            print_traceback()
            # del the processing record
            # Roll back: rewrite the cache without this URL's placeholder.
            lines = []
            if os.path.exists(cache_file):
                for line in jsonlines.open(cache_file):
                    if line['url'] != data['url']:
                        lines.append(line)
            with jsonlines.open(cache_file, mode='w') as writer:
                for new_line in lines:
                    writer.write(new_line)
            return 'failed'
        date2 = datetime.datetime.now()
        logger.info('Parsing pdf time: ' + str(date2 - date1))
        data['content'] = pdf_content
        data['type'] = 'pdf'
        # Display title = file stem; split on both separators to cover
        # POSIX and Windows style paths.
        extract = pdf_path.split('/')[-1].split('\\')[-1].split('.')[0]
    elif data['content'] and data['type'] == 'html':
        # Same placeholder-record pattern as the PDF branch.
        new_record = Record(url=data['url'],
                            time='',
                            type=data['type'],
                            raw=[],
                            extract='',
                            topic='',
                            checked=False,
                            session=[]).to_dict()
        with jsonlines.open(cache_file, mode='a') as writer:
            writer.write(new_record)
        try:
            # Stage the raw HTML to disk because parse_html_bs reads a file.
            tmp_html_file = os.path.join(cache_root, 'tmp.html')
            save_text_to_file(tmp_html_file, data['content'])
            data['content'] = parse_html_bs(tmp_html_file)
        except Exception:
            print_traceback()
        # NOTE(review): if parsing above raised, data['content'] is still
        # the raw HTML string and this indexing will raise (and, unlike the
        # PDF branch, the placeholder record is NOT rolled back) — confirm
        # whether that is intended.
        extract = data['content'][0]['metadata']['title']
    else:
        logger.error(
            'Only Support the Following File Types: [\'.html\', \'.pdf\', \'.docx\', \'.pptx\']'
        )
        raise NotImplementedError
    today = datetime.date.today()
    # Final record replaces the placeholder: checked=True, real content.
    new_record = Record(url=data['url'],
                        time=str(today),
                        type=data['type'],
                        raw=data['content'],
                        extract=extract,
                        topic='',
                        checked=True,
                        session=[])
    # Rewrite the whole cache, dropping any prior record for this URL
    # (including the placeholder) before appending the finished one.
    lines = []
    if os.path.exists(cache_file):
        for line in jsonlines.open(cache_file):
            if line['url'] != data['url']:
                lines.append(line)
    lines.append(new_record.to_dict())  # cache
    with jsonlines.open(cache_file, mode='w') as writer:
        for new_line in lines:
            writer.write(new_line)
    response = 'Cached'
    return response