File size: 2,770 Bytes
893dac4
842e849
 
 
 
 
 
 
 
 
 
893dac4
842e849
 
 
 
 
 
 
 
 
 
893dac4
842e849
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import gradio as gr
import argparse
import json
import logging
import os
import sys
import pathlib
import random
import shutil
import time
from typing import Any, Dict, List, Union

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer, HashingVectorizer,
                                             TfidfVectorizer)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from lr.hyperparameters import SEARCH_SPACE, RandomSearch, HyperparameterSearch
from shutil import rmtree


def load_model(serialization_dir):
    with open(os.path.join(serialization_dir, "best_hyperparameters.json"), 'r') as f:
        hyperparameters = json.load(f)
    if hyperparameters.pop('stopwords') == 1:
        stop_words = 'english'
    else:
        stop_words = None
    weight = hyperparameters.pop('weight')
    if weight == 'binary':
        binary = True
    else:
        binary = False
    ngram_range = hyperparameters.pop('ngram_range')
    ngram_range = sorted([int(x) for x in ngram_range.split()])
    if weight == 'tf-idf':
        vect = TfidfVectorizer(stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    elif weight == 'hash':
        vect = HashingVectorizer(stop_words=stop_words,lowercase=True,ngram_range=ngram_range)
    else:
        vect = CountVectorizer(binary=binary,
                               stop_words=stop_words,
                               lowercase=True,
                               ngram_range=ngram_range)
    if weight != "hash":
        with open(os.path.join(serialization_dir, "vocab.json"), 'r') as f:
            vocab = json.load(f)
        vect.vocabulary_ = vocab
    hyperparameters['C'] = float(hyperparameters['C'])
    hyperparameters['tol'] = float(hyperparameters['tol'])
    classifier = LogisticRegression(**hyperparameters)
    if os.path.exists(os.path.join(serialization_dir, "archive", "idf.npy")):
        vect.idf_ = np.load(os.path.join(serialization_dir,  "archive", "idf.npy"))
    classifier.coef_ = np.load(os.path.join(serialization_dir,  "archive", "coef.npy"))
    classifier.intercept_ = np.load(os.path.join(serialization_dir,  "archive", "intercept.npy"))
    classifier.classes_ = np.load(os.path.join(serialization_dir,  "archive", "classes.npy"))
    return classifier, vect

def score(x, clf, vectorizer):
    # score a single document
    return clf.predict_proba(vectorizer.transform([x]))

clf, vectorizer = load_model("model/")

def start(text):
    k = round(score(text, clf, vectorizer)[0][1], 2)
    return {"GPT-3 Filter Quality Score": k }