from fastapi import FastAPI
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from pydantic import BaseModel
from sklearn.feature_extraction.text import CountVectorizer
import yake


app = FastAPI()


class InputText(BaseModel):
    text: str
    threshold: float = 0.0


# Sentiment model: three labels (negative / neutral / positive); a fourth
# "mixed" label is registered for the extra logit computed in /sentiment_score.
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_model.config.id2label[3] = "mixed"

# Language identification model covering 51 languages.
model_name = "qanastek/51-languages-classifier"
language_model = AutoModelForSequenceClassification.from_pretrained(model_name)
language_tokenizer = AutoTokenizer.from_pretrained(model_name)


# YAKE! keyword extractor configuration (Indonesian, up to trigrams).
language = "id"
max_ngram_size = 3
deduplication_threshold = 0.6
deduplication_algo = "seqm"
window_size = 3
num_of_keywords = 20

kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    dedupFunc=deduplication_algo,
    windowsSize=window_size,
    top=num_of_keywords,
    features=None,
)
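
# Note: extract_keywords returns (phrase, score) pairs where *lower* YAKE!
# scores mean more relevant. A sketch of the shape (text and values are
# illustrative only, not real outputs):
#   kw_extractor.extract_keywords("pembelajaran mesin dan data besar")
#   -> [("pembelajaran mesin", 0.04), ("data besar", 0.09), ...]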


@app.get("/")
def greet_json():
    return {"Hello": "World!"}


@app.post("/key_phrase_extraction")
async def key_phrase_extraction(inp: InputText):
    def merge_keyphrases(keyphrases):
        # Repeatedly merge adjacent keyphrases whose word boundaries overlap
        # (e.g. "machine learning" + "learning models" -> "machine learning
        # models") until the list stops changing.
        if not keyphrases:
            return []
        new_merged = keyphrases
        while True:
            merged = [new_merged[0]]
            for i in range(1, len(new_merged)):
                keys = new_merged[i]
                keys_prev = new_merged[i - 1]
                label = keys["label"]
                score = keys["score"]
                # Enumerate the n-grams of the current label, longest first.
                vectorizer = CountVectorizer(
                    ngram_range=(1, len(label.split(" "))), lowercase=False
                )
                analyzer = vectorizer.build_analyzer()
                for key in analyzer(label)[::-1]:
                    key_prev = keys_prev["label"][::-1]
                    # If this n-gram is a suffix of the previous label, prepend
                    # the previous label's non-overlapping prefix and replace
                    # both entries with the merged phrase.
                    if key == key_prev[: len(key)][::-1].strip():
                        label = key_prev[len(key):][::-1].strip() + " " + label
                        score = max(keys_prev["score"], keys["score"])
                        merged.pop()
                        break
                merged.append({"label": label.strip(), "score": score})
            if new_merged == merged:
                break
            new_merged = merged
        return merged

    keywords = kw_extractor.extract_keywords(inp.text)

    # YAKE! scores are distances (lower is better), so 1 - score is used as a
    # relevance score before thresholding.
    return merge_keyphrases(
        [{"label": key[0], "score": 1 - key[1]} for key in keywords if 1 - key[1] > inp.threshold]
    )
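
# Example request (a sketch assuming the default uvicorn port; text and
# scores are illustrative only):
#
#   curl -X POST http://localhost:8000/key_phrase_extraction \
#        -H "Content-Type: application/json" \
#        -d '{"text": "pembelajaran mesin dan data besar", "threshold": 0.5}'
#
#   -> [{"label": "pembelajaran mesin", "score": 0.93}, ...]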


@app.post("/language_detection")
async def language_detection(inp: InputText):
    inputs = language_tokenizer(inp.text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = language_model(**inputs).logits

    # Single-label classifier, so softmax (not per-class sigmoid) turns the
    # logits into a probability distribution over the 51 languages.
    probs = torch.softmax(logits, dim=-1)

    # Keep only the languages whose probability clears the threshold.
    mask = probs >= inp.threshold
    filtered = probs[mask]

    # Sort the surviving probabilities (descending) and map the sort order
    # back to the original label indices.
    sorted_indices = torch.argsort(filtered, descending=True)
    original_indices = torch.nonzero(mask, as_tuple=True)[1][sorted_indices]

    return [
        {
            "label": language_model.config.id2label[idx.item()],
            "score": probs[0, idx].item(),
        }
        for idx in original_indices
    ]
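
# Example request (sketch assuming the default uvicorn port; the label shown
# is a locale-style code as this model uses, and the score is illustrative):
#
#   curl -X POST http://localhost:8000/language_detection \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Selamat pagi, apa kabar?", "threshold": 0.1}'
#
#   -> [{"label": "id-ID", "score": 0.98}]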


@app.post("/sentiment_score")
async def sentiment_score(inp: InputText):
    # Truncate at the token level rather than slicing characters, so long
    # inputs cannot exceed the model's context window.
    inputs = sentiment_tokenizer(
        inp.text, return_tensors="pt", truncation=True, max_length=512
    )
    with torch.no_grad():
        logits = sentiment_model(**inputs).logits

    # Shift every logit by |neutral|; softmax is shift-invariant, but the
    # shifted values feed the "mixed" heuristic below.
    logits = logits + logits[0, 1].abs()

    # Heuristic "mixed" logit: subtract from 1 the gap between the negative
    # (index 0) and positive (index 2) logits, scaled by a factor derived from
    # the neutral logit, so near-tied polar logits yield a large mixed score.
    mixed_logit = 1 - torch.abs(logits[0, 0] - logits[0, -1]) * (
        2 + logits[0, 1] // torch.max(torch.abs(logits[0, ::2]))
    )
    logits = torch.cat((logits, mixed_logit.unsqueeze(0).unsqueeze(0)), dim=-1)

    probs = torch.softmax(logits, dim=-1)

    # Return all four labels (negative/neutral/positive/mixed), best first.
    return [
        {
            "label": sentiment_model.config.id2label[idx.item()],
            "score": probs[0, idx].item(),
        }
        for idx in probs.argsort(dim=-1, descending=True)[0]
    ]
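
# Minimal local smoke test: a sketch assuming fastapi's TestClient (and its
# httpx dependency) is available; the sample text and threshold are
# illustrative only. Run this module directly to exercise all three endpoints.
if __name__ == "__main__":
    from fastapi.testclient import TestClient

    client = TestClient(app)
    payload = {"text": "Saya sangat senang dengan produk ini", "threshold": 0.1}
    print(client.post("/key_phrase_extraction", json=payload).json())
    print(client.post("/language_detection", json=payload).json())
    print(client.post("/sentiment_score", json=payload).json())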