# Source: test / nlp_transform.py — uploader avatar: "Kaibo93's picture"; commit "Upload 7 files" (119f7b3, verified)
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re
# Model identifier passed to both from_pretrained() calls below.
model_name = "ckiplab/bert-base-chinese-ner"
# Tokenizer and token-classification model are loaded at import time, so
# importing this module triggers the load.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# Shared NER callable used by extract_conditions(), which reads the "word",
# "entity_group", "start" and "end" keys of every item it returns.
# NOTE(review): aggregation_strategy="simple" is presumably what merges
# sub-word tokens into whole entities and yields "entity_group" -- confirm
# against the transformers pipeline documentation.
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
# Keywords that may precede a percentage, mapped to the result key they fill.
_RATIO_KEYWORDS = {
    "建蔽率": "BCR",
    "容積率": "FAR",
    "容積獎勵": "bonus_far",
}

# Area units recognised in a QUANTITY entity.
_AREA_UNITS = ("坪", "平方公尺", "㎡", "m2", "m²")

# First integer in a string; a well-formed thousands-grouped number such as
# "1,200" is taken as a whole instead of stopping at the comma.
_FIRST_INT_RE = re.compile(r"\d{1,3}(?:,\d{3})+(?!\d)|\d+")


def _first_int(compact_word):
    """Return the first integer found in *compact_word*, or None if there is none."""
    match = _FIRST_INT_RE.search(compact_word)
    if match is None:
        return None
    return int(match.group().replace(",", ""))


def _nearest_keyword(prefix):
    """Return the ratio keyword whose last occurrence is closest to the end of *prefix*."""
    best_pos = -1
    best_keyword = ""
    for keyword in _RATIO_KEYWORDS:
        pos = prefix.rfind(keyword)
        if pos > best_pos:
            best_pos = pos
            best_keyword = keyword
    return best_keyword


def _adjacent_word(text, entities, index):
    """Return the word of the entity directly following entity *index*, else ''."""
    if index + 1 >= len(entities):
        return ""
    following = entities[index + 1]
    current_end = entities[index].get("end")
    following_start = following.get("start")
    if current_end is None or following_start is None:
        # Without character offsets adjacency cannot be checked; fall back to
        # merging unconditionally.
        return following["word"]
    if text[current_end:following_start].strip():
        # Other text sits between the two entities, so the next one is not the
        # split-off unit of this number.
        return ""
    return following["word"]


def _has_area_unit(candidate):
    """Return True when *candidate* contains one of the recognised area units."""
    return any(unit in candidate for unit in _AREA_UNITS)


def extract_conditions(text):
    """Extract building-regulation figures from free text using the NER pipeline.

    Runs the module-level ``ner`` callable on *text* and inspects every entity
    that contains a number:

    * Percentages (word contains ``%``/``%`` or the label is ``PERCENT``) are
      assigned to the keyword that appears closest before them in *text*:
      ``建蔽率`` -> ``"BCR"``, ``容積率`` -> ``"FAR"``, ``容積獎勵`` -> ``"bonus_far"``.
      A percentage with none of these keywords before it is ignored.
    * ``QUANTITY`` entities carrying an area unit fill ``"site_area"`` in square
      metres; values in 坪 are converted with 1 坪 = 3.3058 m² and rounded.

    Args:
        text: The raw input string handed to the NER pipeline.

    Returns:
        dict: Any subset of the keys ``"BCR"``, ``"FAR"``, ``"bonus_far"`` and
        ``"site_area"``, each mapped to an int. When the same key is detected
        more than once, the last detection wins. Decimal parts are dropped
        (only the leading integer of a number is read).
    """
    print("🧠 開始分析文字內容...")
    entities = ner(text)
    print("🧠 模型辨識到:", entities)

    result = {}
    for i, e in enumerate(entities):
        # Spaces inside the entity word are removed before any matching so that
        # numbers and units split by spaces are read as one string.
        word = e["word"].replace(" ", "")
        label = e["entity_group"]

        num = _first_int(word)
        if num is None:
            continue

        # Percentage types (building coverage ratio / floor area ratio / bonus).
        if "%" in word or "%" in word or label == "PERCENT":
            # e["start"] is an offset into the ORIGINAL text, so slice that and
            # only then drop spaces; slicing a space-stripped copy would shift
            # the window and could pull in keywords that come after the number.
            prefix = text[:e["start"]].replace(" ", "")
            nearest = _nearest_keyword(prefix)

            if nearest == "建蔽率":
                print(f"🏗️ 建蔽率:{num}%")
                result["BCR"] = num
            elif nearest == "容積率":
                print(f"🏗️ 容積率:{num}%")
                result["FAR"] = num
            elif nearest == "容積獎勵":
                print(f"🎁 容積獎勵:{num}%")
                result["bonus_far"] = num

        # Area types.
        elif label == "QUANTITY":
            candidate = word
            if not _has_area_unit(candidate):
                # The unit may have been split into the next entity; merge it
                # only when that entity directly follows this one.
                candidate += _adjacent_word(text, entities, i).replace(" ", "")
            if not _has_area_unit(candidate):
                continue

            if "坪" in candidate:
                sqm = round(num * 3.3058)
                print(f"📏 偵測到 {num} 坪 → {sqm} m²")
                result["site_area"] = sqm
            else:
                print(f"📏 偵測到 {num} 平方公尺")
                result["site_area"] = num

    print("🧠 NLP 萃取結果:", result)
    return result