Spaces:
Running
Running
| # ============================================ | |
| # text_utils.py | |
| # ํ์ผ ์ถ์ถ, ์น ๊ฒ์, ๊ธฐ๋ณธ ํ ์คํธ ์ฒ๋ฆฌ ํจ์๋ค | |
| # ============================================ | |
| import re, os, json, time, zipfile, tempfile, zlib | |
| from pathlib import Path | |
| from collections import Counter | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from xml.etree import ElementTree as ET | |
# Optional dependency probes: each HAS_* flag records whether the matching
# third-party library imported successfully in this environment. Callers
# check the flag before touching the module name.
HAS_HTTPX = True
try:
    import httpx
except ImportError:
    HAS_HTTPX = False
HAS_PDFPLUMBER = True
try:
    import pdfplumber
except ImportError:
    HAS_PDFPLUMBER = False
HAS_PYPDF2 = True
try:
    import PyPDF2
except ImportError:
    HAS_PYPDF2 = False
HAS_DOCX = True
try:
    from docx import Document as DocxDocument
except ImportError:
    HAS_DOCX = False
HAS_OLEFILE = True
try:
    import olefile
except ImportError:
    HAS_OLEFILE = False
| # ============================================ | |
| # ํ์ผ ์ถ์ถ ํจ์๋ค | |
| # ============================================ | |
def extract_text_from_pdf(file_path):
    """Extract text from a PDF, one string per page.

    Tries pdfplumber first, then falls back to PyPDF2.
    Returns (pages, None) on success or (None, error_message) on failure.
    """
    if HAS_PDFPLUMBER:
        try:
            pages = []
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if text:
                        pages.append(text)
            if pages:
                return pages, None
        except Exception as e:
            print(f"pdfplumber: {e}")
    if HAS_PYPDF2:
        try:
            # Fresh list: a mid-file pdfplumber failure may have left partial
            # pages behind; reusing that list would duplicate them here.
            pages = []
            with open(file_path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                for page in reader.pages:
                    text = page.extract_text()
                    if text:
                        pages.append(text)
            if pages:
                return pages, None
        except Exception as e:
            print(f"PyPDF2: {e}")
    return None, "PDF ์ถ์ถ ์คํจ"
def extract_text_from_docx(file_path):
    """Extract text from a DOCX file, grouping paragraphs into sections.

    Consecutive non-empty paragraphs form one section; blank paragraphs
    act as separators. Returns (sections, None) or (None, error_message).
    """
    if not HAS_DOCX:
        return None, "python-docx ์์"
    try:
        document = DocxDocument(file_path)
        sections, buffer = [], []
        for paragraph in document.paragraphs:
            stripped = paragraph.text.strip()
            if stripped:
                buffer.append(stripped)
            elif buffer:
                # Blank paragraph closes the current section.
                sections.append('\n'.join(buffer))
                buffer = []
        if buffer:
            sections.append('\n'.join(buffer))
        return (sections, None) if sections else (None, "DOCX ํ ์คํธ ์์")
    except Exception as e:
        return None, f"DOCX ์ค๋ฅ: {e}"
def extract_text_from_txt(file_path):
    """Read a plain-text file (TXT/MD/CSV...), trying several encodings.

    Splits the text on blank lines into sections. Returns (sections, None)
    on success, or (None, error_message) when no encoding yields content.
    """
    for enc in ['utf-8', 'euc-kr', 'cp949', 'utf-16', 'latin-1']:
        try:
            with open(file_path, 'r', encoding=enc) as f:
                text = f.read()
        # Narrowed from a bare except: only decode/IO failures should move
        # on to the next encoding; anything else would be a real bug.
        except (OSError, UnicodeError, ValueError):
            continue
        if text.strip():
            sections = [s.strip() for s in re.split(r'\n{2,}', text) if s.strip()]
            return sections if sections else [text], None
    return None, "ํ ์คํธ ์ธ์ฝ๋ฉ ์คํจ"
def extract_text_from_hwpx(file_path):
    """Extract text from an HWPX (Hangul 2007+) archive.

    HWPX is a zip of XML parts; body text lives in Contents/section*.xml.
    Returns (list_of_section_texts, None) on success, (None, error) otherwise.
    """
    try:
        text_parts = []
        with zipfile.ZipFile(file_path, 'r') as zf:
            file_list = zf.namelist()
            # Standard layout: Contents/section0.xml, section1.xml, ...
            section_files = sorted([f for f in file_list if f.startswith('Contents/section') and f.endswith('.xml')])
            if not section_files:
                # Fallback: any XML part whose name mentions "section".
                section_files = sorted([f for f in file_list if 'section' in f.lower() and f.endswith('.xml')])
            for sf_name in section_files:
                try:
                    with zf.open(sf_name) as sf:
                        content = sf.read().decode('utf-8', errors='ignore')
                        # Strip namespace declarations and tag prefixes so
                        # elements can be matched on their bare local names.
                        content = re.sub(r'\sxmlns[^"]*"[^"]*"', '', content)
                        content = re.sub(r'<[a-zA-Z]+:', '<', content)
                        content = re.sub(r'</[a-zA-Z]+:', '</', content)
                        try:
                            root = ET.fromstring(content)
                            texts = []
                            for elem in root.iter():
                                # <t> elements carry the actual run text.
                                if elem.tag.endswith('t') or elem.tag == 't':
                                    if elem.text: texts.append(elem.text)
                                elif elem.text and elem.text.strip():
                                    # Heuristic: other text-bearing tags.
                                    if any(x in elem.tag.lower() for x in ['text', 'run', 'para', 'char']):
                                        texts.append(elem.text.strip())
                            if texts: text_parts.append(' '.join(texts))
                        except ET.ParseError:
                            # XML still malformed after the rewrites: fall back
                            # to scraping raw text between tags with a regex.
                            matches = re.findall(r'>([^<]+)<', content)
                            clean = [t.strip() for t in matches if t.strip() and len(t.strip()) > 1]
                            if clean: text_parts.append(' '.join(clean))
                except: continue  # best-effort: skip unreadable section parts
        if text_parts:
            return text_parts, None
        return None, "HWPX ํ ์คํธ ์์"
    except zipfile.BadZipFile:
        return None, "์ ํจํ์ง ์์ HWPX"
    except Exception as e:
        return None, f"HWPX ์ค๋ฅ: {e}"
| def _decode_hwp_para(data): | |
| """HWP ๋ฌธ๋จ ๋์ฝ๋ฉ""" | |
| result = [] | |
| i = 0 | |
| while i < len(data) - 1: | |
| code = int.from_bytes(data[i:i+2], 'little') | |
| if code in (1,2,3): i += 14 | |
| elif code == 9: result.append('\t') | |
| elif code in (10,13): result.append('\n') | |
| elif code == 24: result.append('-') | |
| elif code in (30,31): result.append(' ') | |
| elif code >= 32: | |
| try: | |
| ch = chr(code) | |
| if ch.isprintable() or ch in '\n\t ': result.append(ch) | |
| except: pass | |
| i += 2 | |
| text = ''.join(result).strip() | |
| text = re.sub(r'[ \t]+', ' ', text) | |
| text = re.sub(r'\n{3,}', '\n\n', text) | |
| return text if len(text) > 2 else None | |
def _extract_hwp_section(data):
    """Walk HWP record headers in *data*, decoding paragraph-text records.

    Record header dword packs tag (10 bits), level, and size (12 bits);
    size 0xFFF means the real size follows in the next dword. Tag 67 is
    paragraph text. Returns joined text or None when nothing decodes.
    """
    collected = []
    offset = 0
    total = len(data)
    while offset < total - 4:
        try:
            header = int.from_bytes(data[offset:offset + 4], 'little')
            tag = header & 0x3FF
            size = (header >> 20) & 0xFFF
            offset += 4
            if size == 0xFFF:
                # Extended size: actual length is stored in the next dword.
                if offset + 4 > total:
                    break
                size = int.from_bytes(data[offset:offset + 4], 'little')
                offset += 4
            if offset + size > total:
                break
            payload = data[offset:offset + size]
            offset += size
            if tag == 67 and size > 0:
                decoded = _decode_hwp_para(payload)
                if decoded:
                    collected.append(decoded)
        except:
            offset += 1  # resync one byte at a time on a malformed record
    return '\n'.join(collected) if collected else None
def extract_text_from_hwp(file_path):
    """Extract text from a legacy HWP (5.0, OLE compound) document.

    Returns (list_of_section_texts, None) on success, (None, error) otherwise.
    """
    if not HAS_OLEFILE:
        return None, "olefile ์์"
    try:
        ole = olefile.OleFileIO(file_path)
        try:
            if not ole.exists('FileHeader'):
                return None, "HWP ํค๋ ์์"
            header_data = ole.openstream('FileHeader').read()
            # Bit 0 of byte 36 of FileHeader flags raw-deflate compression of
            # the body streams; assume compressed when the header is short.
            is_compressed = (header_data[36] & 1) == 1 if len(header_data) > 36 else True
            all_texts = []
            for entry in ole.listdir():
                entry_path = '/'.join(entry)
                # HWP 5.0 stores body text in BodyText/Section0..n (ViewText
                # for distribution documents). The previous filter matched only
                # '*_content.xml', which these streams never use, so extraction
                # always returned nothing; the old suffix is kept as fallback.
                if 'Section' not in entry_path:
                    continue
                if not ('BodyText' in entry_path or 'ViewText' in entry_path
                        or entry_path.endswith('_content.xml')):
                    continue
                try:
                    with ole.openstream(entry) as stream:
                        content = stream.read()
                    if is_compressed:
                        try:
                            content = zlib.decompress(content, -zlib.MAX_WBITS)
                        except zlib.error:
                            pass  # stream may actually be stored uncompressed
                    t = _extract_hwp_section(content)
                    if t:
                        all_texts.append(t)
                except Exception:
                    pass  # best-effort: skip unreadable streams
            if all_texts:
                return all_texts, None
            return None, "HWP ํ ์คํธ ์์"
        finally:
            ole.close()  # previously leaked when an exception escaped
    except Exception as e:
        return None, f"HWP ์ค๋ฅ: {e}"
def extract_file_text_api(file_obj):
    """Dispatch an uploaded file object to the extractor for its extension.

    Unknown extensions fall back to the plain-text extractor. Returns the
    joined text, or a warning/error string on failure.
    """
    if not file_obj:
        return ""
    path = Path(file_obj.name)
    # Extension -> extractor dispatch table; default is plain text.
    extractors = {
        '.pdf': extract_text_from_pdf,
        '.docx': extract_text_from_docx,
        '.txt': extract_text_from_txt,
        '.md': extract_text_from_txt,
        '.csv': extract_text_from_txt,
        '.hwpx': extract_text_from_hwpx,
        '.hwp': extract_text_from_hwp,
    }
    extractor = extractors.get(path.suffix.lower(), extract_text_from_txt)
    texts, error = extractor(str(path))
    if error:
        return f"โ ๏ธ {error}"
    return '\n\n'.join(texts) if texts else "ํ ์คํธ ์ถ์ถ ์คํจ"
| # ============================================ | |
| # ๊ธฐ๋ณธ ํ ์คํธ ์ฒ๋ฆฌ | |
| # ============================================ | |
def split_sentences(text):
    """Split *text* into sentences on terminal punctuation (., !, ?).

    Whitespace is normalized first; empty fragments are dropped.
    """
    normalized = re.sub(r'\s+', ' ', text).strip()
    fragments = re.split(r'[.!?]+(?=\s|$)', normalized)
    return [fragment.strip() for fragment in fragments if fragment.strip()]
def split_words(text):
    """Tokenize *text* into runs of word characters per the class below."""
    # findall on a '+' pattern never yields empty strings, so no filter needed.
    return re.findall(r'[๊ฐ-ํฃa-zA-Z0-9]+', text)
| # ============================================ | |
| # HTTP ํฌํผ | |
| # ============================================ | |
def http_get(url, headers=None, timeout=10):
    """Best-effort HTTP GET.

    Returns the response body on HTTP 200, or None on any failure
    (httpx unavailable, network error, non-200 status).
    """
    if not HAS_HTTPX:
        return None
    try:
        r = httpx.get(url, headers=headers, timeout=timeout)
        return r.text if r.status_code == 200 else None
    # Narrowed from a bare except, which also swallowed KeyboardInterrupt
    # and SystemExit.
    except Exception:
        return None
| # ============================================ | |
| # ์น ๊ฒ์ ํจ์๋ค | |
| # ============================================ | |
def brave_search(query, count=5):
    """Query the Brave Search web API.

    Returns a list of {title, url, snippet, source} dicts; empty list when
    BRAVE_API_KEY is unset or any error occurs.
    """
    import urllib.parse
    brave_key = os.getenv("BRAVE_API_KEY", "")
    if not brave_key:
        return []
    # URL-encode the query: raw spaces/&/# previously corrupted the URL.
    q = urllib.parse.quote(query)
    url = f"https://api.search.brave.com/res/v1/web/search?q={q}&count={count}"
    try:
        if HAS_HTTPX:
            r = httpx.get(url, headers={"X-Subscription-Token": brave_key, "Accept": "application/json"}, timeout=10)
            if r.status_code == 200:
                data = r.json()
                results = []
                for item in data.get("web", {}).get("results", []):
                    results.append({"title": item.get("title", ""), "url": item.get("url", ""), "snippet": item.get("description", ""), "source": "Brave"})
                return results
    except Exception:
        pass  # best-effort: network/JSON failures yield an empty result list
    return []
def search_kci(query):
    """Search KCI (Korea Citation Index) articles by title; up to 3 results."""
    import urllib.parse
    try:
        # URL-encode the query: raw spaces/& previously corrupted the URL.
        q = urllib.parse.quote(query)
        url = f"https://open.kci.go.kr/po/openapi/openApiSearch.kci?apiCode=articleSearch&title={q}&displayCount=3"
        resp = http_get(url, timeout=8)
        if resp:
            results = []
            # Pull title/url pairs out of the XML CDATA blocks.
            for m in re.finditer(r'<article-title><!\[CDATA\[(.+?)\]\]></article-title>.*?<url><!\[CDATA\[(.+?)\]\]></url>', resp, re.S):
                results.append({"title": m.group(1), "url": m.group(2), "snippet": "", "source": "KCI"})
            return results[:3]
    except Exception:
        pass  # best-effort: any failure yields an empty result list
    return []
def search_riss(query):
    """Scrape RISS search results; returns up to 3 {title, url, ...} dicts."""
    import urllib.parse
    results = []
    try:
        # URL-encode the query: raw spaces/& previously corrupted the URL.
        q = urllib.parse.quote(query)
        url = f"http://www.riss.kr/search/Search.do?isDetailSearch=N&searchGubun=true&viewYn=OP&queryText=&strQuery={q}&iStartCount=0&iGroupView=5&icate=all"
        resp = http_get(url, timeout=8)
        if resp:
            for m in re.finditer(r'class="title"[^>]*>.*?<a[^>]*href="([^"]+)"[^>]*>(.*?)</a>', resp, re.S):
                title = re.sub(r'<[^>]+>', '', m.group(2)).strip()  # strip inner HTML tags
                if title:
                    results.append({"title": title, "url": "https://www.riss.kr" + m.group(1), "snippet": "", "source": "RISS"})
    except Exception:
        pass  # best-effort: any failure yields whatever was collected
    return results[:3]
def search_arxiv(query):
    """Query the arXiv Atom API and return up to 3 result dicts."""
    import urllib.parse
    hits = []
    try:
        encoded = urllib.parse.quote(query)
        url = f"https://export.arxiv.org/api/query?search_query=all:{encoded}&start=0&max_results=3&sortBy=relevance"
        body = http_get(url, timeout=12)
        if body:
            entry_pattern = r'<entry>.*?<title>(.*?)</title>.*?<id>(.*?)</id>.*?<summary>(.*?)</summary>'
            for match in re.finditer(entry_pattern, body, re.S):
                title = re.sub(r'\s+', ' ', match.group(1)).strip()
                summary = re.sub(r'\s+', ' ', match.group(3)).strip()[:150]
                hits.append({"title": title, "url": match.group(2).strip(), "snippet": summary, "source": "arXiv"})
    except: pass
    return hits[:3]
def duckduckgo_search(query, max_results=5):
    """Scrape the DuckDuckGo HTML endpoint for up to *max_results* hits."""
    import urllib.parse
    found = []
    try:
        encoded = urllib.parse.quote(query)
        page = http_get(
            f"https://html.duckduckgo.com/html/?q={encoded}",
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"},
            timeout=10,
        )
        if page:
            result_pattern = r'<a[^>]+class="result__a"[^>]+href="([^"]+)"[^>]*>(.*?)</a>.*?<a[^>]+class="result__snippet"[^>]*>(.*?)</a>'
            for match in re.finditer(result_pattern, page, re.S):
                href = match.group(1)
                title = re.sub(r'<[^>]+>', '', match.group(2)).strip()
                snippet = re.sub(r'<[^>]+>', '', match.group(3)).strip()
                # DDG wraps targets in a redirect; unwrap the uddg= parameter.
                real_url = href
                if 'uddg=' in href:
                    wrapped = re.search(r'uddg=([^&]+)', href)
                    if wrapped:
                        real_url = urllib.parse.unquote(wrapped.group(1))
                if title:
                    found.append({"title": title, "url": real_url, "snippet": snippet, "source": "Web"})
                    if len(found) >= max_results:
                        break
    except: pass
    return found
def self_crawl_search(query, max_results=3):
    """Run a DuckDuckGo crawl, adding an academic-flavored follow-up query
    when the original query does not already look academic."""
    combined = list(duckduckgo_search(query, max_results))
    looks_academic = '๋ ผ๋ฌธ' in query or 'paper' in query.lower()
    if not looks_academic:
        combined += duckduckgo_search(f"{query} ๋ ผ๋ฌธ ํ์ ", 2)
    return combined
def parallel_brave_search(queries, max_workers=10):
    """Run brave_search over *queries* concurrently (3 results each).

    Returns {query: results_list}; a failed query maps to [].
    Worker count is capped at 20.
    """
    all_results = {}
    with ThreadPoolExecutor(max_workers=min(max_workers, 20)) as executor:
        futures = {executor.submit(brave_search, q, 3): q for q in queries}
        for future in as_completed(futures):
            q = futures[future]
            try:
                all_results[q] = future.result()
            # Narrowed from a bare except, which also caught KeyboardInterrupt.
            except Exception:
                all_results[q] = []
    return all_results