| """ | |
| LIFT UP Taksonomi SΔ±nΔ±flandΔ±rΔ±cΔ± β Backend API | |
| Hugging Face Spaces (Docker) ΓΌzerinde Γ§alΔ±ΕΔ±r. | |
| Endpoint: | |
| POST /classify | |
| { | |
| "baslik": "Proje baΕlΔ±ΔΔ±", | |
| "ozet": "Proje ΓΆzeti", | |
| "keywords": ["opsiyonel", "liste"] # opsiyonel | |
| } | |
| β | |
| { | |
| "prediction": "Kompozit YapΔ±lar", | |
| "confidence": 0.82, | |
| "top_3": [...], | |
| "extracted_keywords": [...], | |
| "processing_time_ms": 1240 | |
| } | |
| """ | |
import os
import re
import time
import unicodedata
import logging
from contextlib import asynccontextmanager
from collections import Counter
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Set

import numpy as np
import torch
import torch.nn as nn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from huggingface_hub import hf_hub_download, snapshot_download
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("liftup")

HF_USERNAME = os.getenv("HF_USERNAME", "Engin34")
HF_TOKEN = os.getenv("HF_TOKEN", "")  # provided as a Space secret
# ─── Global model variables ────────────────────────────────────────
bert_model = None
bert_tok = None
kw_model = None
generator = None
clf = None
TOP_KEYWORDS = None
device = torch.device("cpu")

# ───────────────────────────────────────────────────────────────────
# TAXONOMY PARSER
# ───────────────────────────────────────────────────────────────────
def _temizle(k):
    # Normalize a keyword: strip zero-width chars, collapse whitespace, lowercase.
    k = k.replace('\u200b','').replace('\ufeff','')
    k = unicodedata.normalize('NFKC', k)
    return re.sub(r'\s+',' ', k).strip().lower()

def _parantez_ayir(k):
    # Split "term (variant / variant)" into the main term plus its variants,
    # unless the parenthesis is only a qualifier.
    m = re.match(r'^(.+?)\s*\((.+?)\)\s*$', k)
    if not m: return [k]
    ana, ic = m.group(1).strip(), m.group(2).strip()
    if any(a in ic.lower() for a in ['bağlam','kısmı','tarafı','proses','analiz','anahtarları','servisleme']):
        return [ana]
    return [ana] + [p.strip() for p in ic.split('/') if p.strip()]

def _virgul_ayir(metin):
    # Split on commas that are not inside parentheses.
    sonuc, buf, d = [], [], 0
    for c in metin:
        if c == '(': d += 1; buf.append(c)
        elif c == ')': d -= 1; buf.append(c)
        elif c == ',' and d == 0: sonuc.append(''.join(buf)); buf = []
        else: buf.append(c)
    if buf: sonuc.append(''.join(buf))
    return sonuc
def parse_taksonomi(icerik: str) -> Dict:
    # Parse the taxonomy file: "N) Category" headers, each followed by a
    # parenthesised, comma-separated keyword list.
    icerik = icerik.replace('\u200b','')
    matches = list(re.finditer(r'^\s*(\d+)\)\s+(.+?)\s*$', icerik, re.MULTILINE))
    tax = {}
    for i, m in enumerate(matches):
        kat = m.group(2).strip()
        govde = icerik[m.end():(matches[i+1].start() if i+1 < len(matches) else len(icerik))].strip()
        pm = re.search(r'\((.+)\)', govde, re.DOTALL)
        if not pm: continue
        kw_set = set()
        for parca in _virgul_ayir(pm.group(1)):
            for alt in _parantez_ayir(parca.strip()):
                for k in re.split(r'[/]', alt):
                    temiz = _temizle(k)
                    if len(temiz) >= 2: kw_set.add(temiz)
        tax[kat] = {'keywords': kw_set}
    return tax
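
# Expected taksonomi.txt layout, as implied by the regexes above (an
# illustrative sketch only; category names and keywords are placeholders,
# the real file is downloaded from the <HF_USERNAME>/liftup-bert repo):
#
#   1) Kompozit Yapılar
#      (karbon fiber, cam elyaf, reçine transfer kalıplama, ...)
#   2) Örnek Kategori
#      (anahtar kelime bir, anahtar kelime iki, ...)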
# ───────────────────────────────────────────────────────────────────
# HYBRID CLASSIFIER
# ───────────────────────────────────────────────────────────────────
@dataclass
class EslesmeBilgisi:
    keyword: str
    eslesme_tipi: str
    eslesen_taksonomi_kw: str
    puan: float

@dataclass
class KategoriSkoru:
    kategori: str
    final_skor: float
    keyword_skor: float
    semantic_skor: float
    eslesmeler: list = field(default_factory=list)
class HibritSiniflandirici:
    def __init__(self, taxonomy, embedder, keyword_weight=0.4, semantic_weight=0.6):
        self.taxonomy = {c: {'keywords': {str(k).lower().strip() for k in d.get('keywords', set()) if str(k).strip()}}
                         for c, d in taxonomy.items()}
        self.kw_w, self.sem_w = keyword_weight, semantic_weight
        self.embedder = embedder
        log.info("Centroid'ler hesaplanıyor...")
        self.centroids = self._centroids()
        self.idf = self._idf()
        log.info(f"Hazır: {len(self.taxonomy)} kategori")

    def _centroids(self):
        # L2-normalised mean embedding of each category's keywords.
        c = {}
        for cat, d in self.taxonomy.items():
            kws = list(d['keywords'])
            if not kws: c[cat] = None; continue
            embs = self.embedder.encode(kws, show_progress_bar=False, convert_to_numpy=True)
            v = np.mean(embs, axis=0); n = np.linalg.norm(v)
            c[cat] = v/n if n > 0 else v
        return c

    def _idf(self):
        # Keywords shared by many categories get a lower weight.
        cnt = Counter()
        for d in self.taxonomy.values():
            for k in d['keywords']: cnt[k] += 1
        N = len(self.taxonomy)
        return {k: np.log(N/v) + 1.0 for k, v in cnt.items()}

    def _kw_score(self, extracted):
        # IDF-weighted exact/partial keyword matching, normalised to [0, 1].
        ext = [k.lower().strip() for k in extracted if k and str(k).strip()]
        max_idf = max(self.idf.values(), default=1.0)
        results = {}
        for cat, d in self.taxonomy.items():
            cat_kws = d['keywords']; score, eslm = 0.0, []
            for kw in ext:
                idf_w = self.idf.get(kw, 1.0)
                if kw in cat_kws:
                    p = 2.0*idf_w; score += p; eslm.append(EslesmeBilgisi(kw, 'exact', kw, p)); continue
                if len(kw) < 4: continue
                for ck in cat_kws:
                    if len(ck) >= 4 and (kw in ck or ck in kw):
                        p = 1.0*idf_w; score += p; eslm.append(EslesmeBilgisi(kw, 'partial', ck, p)); break
            max_p = max(len(ext)*2.0*max_idf, 1e-6)
            results[cat] = (min(score/max_p, 1.0), eslm)
        return results

    def _sem_score(self, extracted, text=None):
        # Cosine similarity between the document embedding and each category
        # centroid, mapped from [-1, 1] to [0, 1].
        parts = []
        if text and str(text).strip(): parts.append(str(text).strip())
        if extracted: parts.append(" ".join(extracted))
        if not parts: return {c: 0.0 for c in self.taxonomy}
        emb = self.embedder.encode([" | ".join(parts)], show_progress_bar=False, convert_to_numpy=True)[0]
        n = np.linalg.norm(emb)
        if n > 0: emb = emb/n
        return {c: max(0.0, min(1.0, (float(np.dot(emb, cn)) + 1.0)/2.0)) if cn is not None else 0.0
                for c, cn in self.centroids.items()}

    def classify(self, keywords, text=None, top_k=3):
        # Final score = keyword_weight * keyword score + semantic_weight * semantic score.
        kw_r = self._kw_score(keywords)
        sem_s = self._sem_score(keywords, text)
        ks = {}
        for c in self.taxonomy:
            kwn, esl = kw_r[c]
            f = self.kw_w*kwn + self.sem_w*sem_s[c]
            ks[c] = KategoriSkoru(c, f, kwn, sem_s[c], esl)
        srt = sorted(ks.values(), key=lambda x: x.final_skor, reverse=True)
        return {'prediction': srt[0].kategori, 'confidence': srt[0].final_skor, 'top_k': srt[:top_k]}
# ───────────────────────────────────────────────────────────────────
# BERT MODEL
# ───────────────────────────────────────────────────────────────────
class LiftUpBertModel(nn.Module):
    def __init__(self, num_labels=128):
        super().__init__()
        self.bert = AutoModel.from_pretrained("dbmdz/bert-base-turkish-cased")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(out.last_hidden_state[:, 0]))
# ───────────────────────────────────────────────────────────────────
# POST-PROCESSOR
# ───────────────────────────────────────────────────────────────────
class SoftPostProcessor:
    def __init__(self):
        self.blacklist = {'kombinatür','hesonomik','modülasyonları','difüzörlü','optimizasyonlarını'}
        self.acronyms = {'CFD','FEA','CAD','ROS','CNN','AI','ML','DL','IoT','GPU','SSD'}

    def is_acronym(self, w): return w.isupper() and 2 <= len(w) <= 5

    def fix_case(self, kw):
        # Capitalize the first word, keep acronyms upper-case, lower-case the rest.
        out = []
        for w in kw.split():
            if w.upper() in self.acronyms or self.is_acronym(w): out.append(w.upper())
            elif not out: out.append(w.capitalize())
            else: out.append(w.lower())
        return ' '.join(out)

    def should_filter(self, kw):
        # Drop blacklisted, too-short/long, or non-alphabetic keywords.
        if kw.lower() in self.blacklist: return True
        if not (3 <= len(kw) <= 80): return True
        if re.search(r'[^a-zA-ZçÇğĞıİöÖşŞüÜ\s\-]', kw): return True
        return False

    def process(self, keywords, min_kw=3):
        processed = []
        for kw in keywords:
            if self.should_filter(kw): continue
            fixed = self.fix_case(kw)
            if not any(p.lower() == fixed.lower() for p in processed): processed.append(fixed)
        return processed[:8] if processed else keywords[:3]
# ───────────────────────────────────────────────────────────────────
# MODEL LOADING (startup)
# ───────────────────────────────────────────────────────────────────
def load_models():
    global bert_model, bert_tok, kw_model, generator, clf, TOP_KEYWORDS
    auth = {"token": HF_TOKEN} if HF_TOKEN else {}
    log.info("Modeller yükleniyor...")

    # 1) Taxonomy
    log.info("Taksonomi indiriliyor...")
    tax_path = hf_hub_download(
        repo_id=f"{HF_USERNAME}/liftup-bert",
        filename="taksonomi.txt", **auth
    )
    with open(tax_path, encoding='utf-8') as f:
        taxonomy = parse_taksonomi(f.read())

    # 2) BERT checkpoint (for TOP_KEYWORDS)
    log.info("BERT checkpoint indiriliyor...")
    ckpt_path = hf_hub_download(
        repo_id=f"{HF_USERNAME}/liftup-bert",
        filename="checkpoint.pth", **auth
    )
    ckpt = torch.load(ckpt_path, map_location="cpu")
    TOP_KEYWORDS = ckpt["TOP_KEYWORDS"]

    # 3) BERT model weights
    log.info("BERT model ağırlıkları indiriliyor (422 MB)...")
    bert_path = hf_hub_download(
        repo_id=f"{HF_USERNAME}/liftup-bert",
        filename="best_bert_model.pth", **auth
    )
    bert_tok = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
    model = LiftUpBertModel(len(TOP_KEYWORDS))
    model.load_state_dict(torch.load(bert_path, map_location="cpu"))
    model.eval()
    bert_model = model

    # 4) KeyBERT (also serves as the hybrid classifier's embedder)
    log.info("KeyBERT yükleniyor...")
    kw_model = KeyBERT(model='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    embedder = kw_model.model.embedding_model

    # 5) ByT5
    log.info("ByT5 indiriliyor (1.1 GB)...")
    byt5_dir = snapshot_download(
        repo_id=f"{HF_USERNAME}/liftup-byt5", **auth
    )
    byt5_tok = AutoTokenizer.from_pretrained("google/byt5-small")
    byt5_mdl = AutoModelForSeq2SeqLM.from_pretrained(byt5_dir)
    byt5_mdl.eval()
    post = SoftPostProcessor()

    class Generator:
        def __init__(self, tok, mdl, pp):
            self.tok, self.mdl, self.pp = tok, mdl, pp

        def generate(self, title="", abstract=""):
            text = f"keywords: {title} {abstract}".strip()
            inp = self.tok(text, max_length=512, truncation=True, return_tensors="pt")
            with torch.no_grad():
                out = self.mdl.generate(**inp, max_new_tokens=128, do_sample=False,
                                        no_repeat_ngram_size=4, repetition_penalty=1.5)
            pred = self.tok.decode(out[0], skip_special_tokens=True)
            if pred.lower().startswith("keywords:"): pred = pred[9:].strip()
            kws = [k.strip() for k in pred.split(';') if k.strip()]
            return self.pp.process(kws)

    generator = Generator(byt5_tok, byt5_mdl, post)

    # 6) Hybrid classifier
    log.info("Hibrit sınıflandırıcı başlatılıyor...")
    clf = HibritSiniflandirici(taxonomy, embedder)
    log.info("✓ Tüm modeller hazır!")
# ───────────────────────────────────────────────────────────────────
# FASTAPI
# ───────────────────────────────────────────────────────────────────
@asynccontextmanager
async def lifespan(app: FastAPI):
    load_models()
    yield

app = FastAPI(title="LIFT UP Sınıflandırıcı", lifespan=lifespan)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["POST", "GET"],
    allow_headers=["*"],
)
class ClassifyRequest(BaseModel):
    baslik: str
    ozet: str
    keywords: Optional[List[str]] = None

class KategoriResponse(BaseModel):
    kategori: str
    guven: float
    keyword_skor: float
    semantic_skor: float
    eslesmeler: List[str]

class ClassifyResponse(BaseModel):
    prediction: str
    confidence: float
    top_3: List[KategoriResponse]
    extracted_keywords: List[str]
    processing_time_ms: int
def bert_extract(text):
    # Multi-label BERT head: take the 10 highest sigmoid probabilities,
    # keep those above 0.01, return at most 5 keywords.
    enc = bert_tok(str(text).lower(), truncation=True, padding='max_length',
                   max_length=256, return_tensors='pt')
    with torch.no_grad():
        logits = bert_model(enc['input_ids'], enc['attention_mask'])
    probs = torch.sigmoid(logits)[0].numpy()
    idxs = np.argsort(probs)[-10:][::-1]
    return [TOP_KEYWORDS[i] for i in idxs if probs[i] > 0.01][:5]

def keybert_extract(text):
    clean = re.sub(r'[^\w\sğüşıöçĞÜŞİÖÇ]', ' ', text.lower()).strip()
    try:
        kws = kw_model.extract_keywords(clean, keyphrase_ngram_range=(1, 3),
                                        top_n=5, use_mmr=True, diversity=0.2)
        return [k[0] for k in kws][:3]
    except Exception:
        return []
@app.get("/health")
def health(): return {"status": "ok"}

@app.get("/")
def root(): return {"message": "LIFT UP API çalışıyor", "endpoint": "POST /classify"}

@app.post("/classify", response_model=ClassifyResponse)
def classify(req: ClassifyRequest):
    if not req.baslik.strip() or not req.ozet.strip():
        raise HTTPException(400, "Başlık ve özet zorunludur")
    t0 = time.time()
    text = f"{req.baslik} {req.ozet}"
    # Keyword extraction
    bert_kws = bert_extract(text)
    kb_kws = keybert_extract(text)
    byt5_kws = generator.generate(req.baslik, req.ozet)
    # Add user-supplied keywords, if any
    extra = req.keywords or []
    tum_kws = list(dict.fromkeys(bert_kws + kb_kws + byt5_kws + extra))
    # Classification
    sonuc = clf.classify(tum_kws, text, top_k=3)
    ms = int((time.time() - t0) * 1000)
    return ClassifyResponse(
        prediction=sonuc['prediction'],
        confidence=round(sonuc['confidence'], 4),
        top_3=[
            KategoriResponse(
                kategori=ks.kategori,
                guven=round(ks.final_skor, 4),
                keyword_skor=round(ks.keyword_skor, 4),
                semantic_skor=round(ks.semantic_skor, 4),
                eslesmeler=[e.keyword for e in ks.eslesmeler],
            )
            for ks in sonuc['top_k']
        ],
        extracted_keywords=tum_kws,
        processing_time_ms=ms,
    )
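
# Local-run entry point (a minimal sketch, assuming the Space's Dockerfile does
# not already launch the server itself; port 7860 is the Hugging Face Spaces
# default and is an assumption here).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)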