from fastapi import FastAPI
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from pydantic import BaseModel
from sklearn.feature_extraction.text import CountVectorizer
import yake

app = FastAPI()

class InputText(BaseModel):
    text: str
    threshold: float = 0.0  # minimum score a result must reach to be returned (used by the extraction/detection endpoints)
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_model.config.id2label[3] = "mixed"
model_name = 'qanastek/51-languages-classifier'
language_model = AutoModelForSequenceClassification.from_pretrained(model_name)
language_tokenizer = AutoTokenizer.from_pretrained(model_name)
language = "id"
max_ngram_size = 3
deduplication_threshold = 0.6
deduplication_algo = 'seqm'
windowSize = 3
numOfKeywords = 20
kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
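
# Note (illustrative): yake's extract_keywords returns (phrase, score) pairs where a LOWER
# score means a more relevant phrase, e.g. roughly
#   [("kereta api", 0.04), ("jadwal kereta api", 0.07), ...]   # hypothetical values
# The /key_phrase_extraction endpoint below flips this to 1 - score so that higher is better.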
@app.get("/")
def greet_json():
return {"Hello": "World!"}
@app.post("/key_phrase_extraction")
async def key_phrase_extraction(inp:InputText):
def merge_keyphrases(keyphrases):
new_merged = keyphrases
while True:
merged = [new_merged[0]]
for i in range(1, len(keyphrases)):
keys = keyphrases[i]
keys_prev = keyphrases[i-1]
label = keys["label"]
score = keys["score"]
vectorizer = CountVectorizer(ngram_range=( 1,len(label.split(" ")) ), lowercase=False)
analyzer = vectorizer.build_analyzer()
for key in analyzer(label)[::-1]:
key_prev = keys_prev["label"][::-1]
if key == key_prev[:len(key)][::-1].strip():
label = key_prev[len(key):][::-1].strip() + " " + label#.replace(key, "")
score = max(keys_prev["score"],keys["score"])
merged.pop()
break
merged.append({"label":label.strip(), "score":score})
if new_merged == merged:
break
else:
new_merged = merged
return merged
keywords = kw_extractor.extract_keywords(inp.text)
return merge_keyphrases([{"label":key[0], "score":1-key[1]} for key in keywords if 1-key[1]>inp.threshold])
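
# Example call (illustrative sketch, assuming the app is served locally with
# `uvicorn app:app` on port 8000; the sample Indonesian text is hypothetical):
#   curl -X POST http://localhost:8000/key_phrase_extraction \
#        -H "Content-Type: application/json" \
#        -d '{"text": "jadwal kereta api dari bandung ke jakarta", "threshold": 0.1}'
# Response: a JSON list of merged keyphrases, e.g.
#   [{"label": "jadwal kereta api", "score": 0.93}, ...]   # hypothetical values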
@app.post("/language_detection")
async def language_detection(inp: InputText):
inputs = language_tokenizer(inp.text, return_tensors='pt')
with torch.no_grad():
logits = language_model(**inputs).logits
softmax = torch.nn.functional.sigmoid(logits)
# Apply the threshold by creating a mask
mask = softmax >= inp.threshold
# Filter the tensor based on the threshold
filtered_x = softmax[mask]
# Get the sorted indices of the filtered tensor
sorted_indices = torch.argsort(filtered_x, descending=True)
# Map the sorted indices back to the original tensor indices
original_indices = torch.nonzero(mask, as_tuple=True)[1][sorted_indices]
return [{"label":language_model.config.id2label[predicted_class_id.tolist()], "score":softmax[0, predicted_class_id].tolist()} for predicted_class_id in original_indices]
@app.post("/sentiment_score")
async def sentiment_score(inp: InputText):
text = inp.text
inputs = sentiment_tokenizer(text[:2500], return_tensors='pt')
with torch.no_grad():
logits = sentiment_model(**inputs).logits #+ 1
print(logits)
logits = logits + logits[0,1].abs()
# print(torch.nn.functional.sigmoid(logits))
# logits = logits / 10
# print(logits)
# print(torch.abs(logits[0,0] - logits[0,-1]))
# print(logits[0,1]//torch.max(torch.abs(logits[0,::2])))
logits = torch.cat(
(
logits, (
# ( logits[0,1] + torch.sign(logits[0,0] - logits[0,-1]) * (logits[0,0] - logits[0,-1])/2 )/2 +
# (logits[0,0] + logits[0,-1])/20
(1 - torch.abs(logits[0,0] - logits[0,-1])*(2+(logits[0,1]//torch.max(torch.abs(logits[0,::2])))))
).unsqueeze(0).unsqueeze(0)
), dim=-1
)
softmax = torch.nn.functional.softmax(
logits,
dim=-1
)
return [{"label":sentiment_model.config.id2label[predicted_class_id.tolist()], "score":softmax[0, predicted_class_id].tolist()} for predicted_class_id in softmax.argsort(dim=-1, descending=True)[0]]