File size: 11,995 Bytes
286ab7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66e487f
286ab7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186d8ec
286ab7c
 
 
186d8ec
 
286ab7c
186d8ec
286ab7c
 
 
d472283
286ab7c
 
 
 
 
 
 
d472283
 
 
 
 
4a5c5be
286ab7c
d472283
286ab7c
 
 
 
 
bc866db
 
d472283
 
 
 
 
 
286ab7c
 
 
 
 
 
ec33221
286ab7c
 
 
 
f557f6e
 
 
 
 
 
 
 
 
 
 
 
 
 
286ab7c
 
 
4a5c5be
286ab7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9587c22
286ab7c
 
 
 
 
 
 
 
186d8ec
286ab7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9587c22
286ab7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
def main():
    """
    Build and run a Streamlit web app that classifies a body of text as
    human-made or AI-generated / AI-revised using a pre-trained model.

    The user picks a task ("Generated" or "Revised") and one of four models
    (logistic regression, naive Bayes, DistilBERT-based or BERT-based), enters
    a text, and receives a label with a confidence value. A second button
    renders a token-level explanation of the stored prediction (eli5/LIME for
    the joblib-loaded models, SHAP for the BERT-based models).

    Results are kept in ``st.session_state`` so they survive Streamlit reruns.
    """
    # Import libraries (kept local to main, as in the original layout)
    import streamlit as st
    import numpy as np
    import joblib
    import string
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.special` is loaded on every SciPy version.
    import scipy.special
    import spacy
    from transformers import AutoTokenizer
    import torch
    from eli5.lime import TextExplainer
    from eli5.lime.samplers import MaskingTextSampler
    import eli5
    import shap
    from custom_models import HF_DistilBertBasedModelAppDocs, HF_BertBasedModelAppDocs

    # Initialize Spacy
    nlp = spacy.load("en_core_web_sm")

    # device to run DL model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # (old, new) substitutions applied, in this order, to the lower-cased text.
    # NOTE(review): the "[Name]" entry can never match because the text is
    # already lower-cased at that point. It is kept so the cleaning stays
    # byte-identical; presumably the models were trained on text cleaned this
    # way -- confirm before changing it to "[name]".
    boilerplate_substitutions = (
        ("  ", " "),
        ("[Name]", ""),
        ("[your name]", ""),
        ("\n your name", ""),
        ("dear admissions committee,", ""),
        ("sincerely,", ""),
        ("[university's name]", ""),
        ("dear sir/madam,", ""),
        ("โ€“ statement of intent  ", ""),
        ('program: master of science in data analytics  name of applicant:    ', ""),
        ("data analytics", "data science"),
        ("| \u200b", ""),
        ("m.s. in data science at lincoln center  ", ""),
    )

    def format_text(text: str) -> str:
        """
        Return a cleaned version of ``text`` for the classifiers.

        Steps, in order: drop every spaCy token tagged as a PERSON or DATE
        entity and re-join the remaining tokens with single spaces; remove
        "REDACTED"; lower-case; apply the boilerplate substitutions; remove
        all ASCII punctuation; strip leading/trailing whitespace.
        """
        doc = nlp(text)
        kept = " ".join(token.text for token in doc if token.ent_type_ not in ("PERSON", "DATE"))

        cleaned = kept.replace("REDACTED", "").lower()
        for old, new in boilerplate_substitutions:
            cleaned = cleaned.replace(old, new)
        return cleaned.translate(str.maketrans('', '', string.punctuation)).strip()

    # Define the function to classify text
    def nb_lr(model, text):
        """
        Classify raw ``text`` with a joblib-loaded LR/NB model.

        Returns ``(prediction, predict_proba)``: the predicted class and the
        probability of that class rounded to 4 decimals.
        """
        # Clean and format the input text
        text = format_text(text)
        # Predict using either LR or NB and get prediction probability
        prediction = model.predict([text]).item()
        predict_proba = round(model.predict_proba([text]).squeeze()[prediction].item(), 4)
        return prediction, predict_proba

    def torch_pred(tokenizer, model, text):
        """
        Classify raw ``text`` with a BERT/DistilBERT based model.

        The text is cleaned once with ``format_text`` and then tokenized, so
        this function takes raw text just like ``nb_lr``.

        Returns ``(prediction, predict_proba)``: the arg-max class and its
        softmax probability rounded to 4 decimals.
        """
        cleaned_text = format_text(text)
        encoded = tokenizer([cleaned_text], padding='max_length', max_length=512, truncation=True)
        with torch.inference_mode():
            input_ids = torch.tensor(encoded["input_ids"]).to(device)
            attention_mask = torch.tensor(encoded["attention_mask"]).to(device)
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            _, prediction = torch.max(logits, 1)
            prediction = prediction.item()
            predict_proba = round(torch.softmax(logits, 1).cpu().squeeze().tolist()[prediction], 4)
            return prediction, predict_proba

    def pred_str(prediction, option):
        """Map the predicted class (0 = human) and the model option to the label shown to the user."""
        # NOTE(review): the emoji in these literals look mis-encoded in this
        # copy of the file -- confirm against the repository's UTF-8 source.
        if prediction == 0:
            return "Human-made ๐Ÿคทโ€โ™‚๏ธ๐Ÿคทโ€โ™€๏ธ"
        elif prediction == 1 and "Revised" in option:
            return "Revised with AI ๐Ÿฆพ"
        else:
            return "Generated with AI ๐Ÿฆพ"

    # NOTE(review): st.cache is deprecated in newer Streamlit releases in
    # favour of st.cache_resource -- confirm the pinned version before migrating.
    @st.cache(allow_output_mutation=True, suppress_st_warning=True)
    def load_tokenizer(option):
        """Load (and cache) the tokenizer matching the selected BERT or DistilBERT option."""
        if option in ("BERT Generated", "BERT Revised"):
            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", padding='max_length', max_length=512, truncation=True)
        else:
            tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", padding='max_length', max_length=512, truncation=True)
        return tokenizer

    @st.cache(allow_output_mutation=True, suppress_st_warning=True)
    def load_model(option):
        """Load (and cache) the pre-trained transformer model for ``option`` and move it to ``device``."""
        if option == "BERT Generated":
            model = HF_BertBasedModelAppDocs.from_pretrained("GradApplicationDocuments/BERTGradGen").to(device)
        elif option == "BERT Revised":
            model = HF_BertBasedModelAppDocs.from_pretrained("GradApplicationDocuments/BERTGradRevised").to(device)
        elif option == "D-BERT Generated":
            model = HF_DistilBertBasedModelAppDocs.from_pretrained("GradApplicationDocuments/DistilBERTGradGen").to(device)
        else:
            model = HF_DistilBertBasedModelAppDocs.from_pretrained("GradApplicationDocuments/DistilBERTGradRevised").to(device)
        return model

    def show_prediction():
        """Render the prediction stored in the session state.

        The confidence is formatted with two decimals so floating-point
        artefacts (e.g. 99.97000000000001) never reach the page.
        """
        confidence = st.session_state["predict_proba"] * 100
        st.markdown(f"I think this text is: **:{st.session_state['color_pred']}[{st.session_state['prediction']}]** (Confidence: {confidence:.2f}%)")

    # Streamlit app:

    # Display name -> joblib path (LR/NB) or model repository id (BERT-based)
    models_available = {"LR Generated": "models/baseline_model_lr_generated.joblib",
                        "NB Generated": "models/baseline_model_nb_generated.joblib",
                        "D-BERT Generated": "GradApplicationDocuments/DistilBERTGradGen",
                        "BERT Generated": "GradApplicationDocuments/BERTGradGen",
                        "LR Revised": "models/baseline_model_lr_revised.joblib",
                        "NB Revised": "models/baseline_model_nb_revised.joblib",
                        "D-BERT Revised": "GradApplicationDocuments/DistilBERTGradRevised",
                        "BERT Revised": "GradApplicationDocuments/BERTGradRevised",
                        }

    st.set_page_config(page_title="AI/Human GradAppDocs", page_icon="๐Ÿค–", layout="wide")
    st.title("Academic Application Document Classifier")
    st.header("Is it human-made ๐Ÿ“ or Enhanced with AI ๐Ÿค– ?  ")

    st.markdown('AI-generated content has reached an unprecedented level of realism. The models on this website focus on identifying AI-enhanced and AI-generated application materials, such as Statements of Intent (SOI) and Letters of Recommendation (LOR). These models were trained using real-world SOIs and LORs, alongside a revised version and a generated version of each that has been created through AI.')

    # Check the model to use
    def restore_prediction_state():
        """Forget the stored prediction when the user changes the task or the model."""
        if "prediction" in st.session_state:
            del st.session_state.prediction

    option_ai = st.selectbox("Generated/Revised", ["Generated", "Revised"], on_change=restore_prediction_state)
    # Offer only the models of the chosen task, in the order of models_available
    model_names = [name for name in models_available if name.endswith(option_ai)]
    option = st.selectbox("Select a model to use:", model_names, on_change=restore_prediction_state)

    # Every transformer option ("BERT ..." / "D-BERT ...") contains "BERT";
    # the remaining options are the joblib-loaded LR/NB models.
    uses_transformer = "BERT" in option

    # Load the selected trained model
    if uses_transformer:
        tokenizer = load_tokenizer(option)
        model = load_model(option)
    else:
        model = joblib.load(models_available[option])

    text = st.text_area("Enter either a statement of intent or a letter of recommendation:")

    #Hide footer "made with streamlit"
    hide_st_style = """
            <style>
            footer {visibility: hidden;}
            header {visibility: hidden;}
            </style>
            """
    st.markdown(hide_st_style, unsafe_allow_html=True)

    # Use model
    if st.button("Let's check this text!"):
        if not text.strip():
            st.error("Please enter some text")
        else:
            with st.spinner("Wait for the magic ๐Ÿช„๐Ÿ”ฎ"):
                # Both helpers clean the raw text themselves
                if uses_transformer:
                    prediction, predict_proba = torch_pred(tokenizer, model, text)
                    st.session_state["torch"] = True
                else:
                    prediction, predict_proba = nb_lr(model, text)
                    st.session_state["sklearn"] = True

            # Store the result in session state
            st.session_state["color_pred"] = "blue" if prediction == 0 else "red"
            st.session_state["prediction"] = pred_str(prediction, option)
            st.session_state["predict_proba"] = predict_proba
            st.session_state["text"] = text

            # Print result
            show_prediction()

    elif "prediction" in st.session_state:
        # Display the stored result if available
        show_prediction()

    if st.button("Model Explanation"):
        # Check if there's text in the session state
        if "text" in st.session_state and "prediction" in st.session_state:
            # NOTE(review): both explainers receive the raw stored text, whereas
            # the prediction was made on format_text(text) -- confirm whether the
            # explanation should be computed on the cleaned text instead.
            if not uses_transformer:
                with st.spinner('Wait for it ๐Ÿ’ญ...'):
                    explainer = TextExplainer(sampler=MaskingTextSampler())
                    explainer.fit(st.session_state["text"], model.predict_proba)
                    html = eli5.format_as_html(explainer.explain_prediction(target_names=["Human", "AI"]))
                    st.markdown('<span style="color:green"><strong>Green:</strong> Contributes to decision | </span><span style="color:red"><strong>Red:</strong> Opposite</span>', unsafe_allow_html=True)
            else:
                with st.spinner('Wait for it ๐Ÿ’ญ... BERT-based model explanations take around 4-10 minutes. In case you want to abort, please refresh the page.'):
                    # TORCH EXPLAINER PRED FUNC (USES logits)
                    def f(x):
                        # Tokenize the batch exactly like torch_pred does and keep
                        # the attention mask, so the explained function is the one
                        # that produced the prediction.
                        encoded = tokenizer([str(v) for v in x], padding='max_length', max_length=512,
                                            truncation=True, return_tensors="pt")
                        # The inputs must live on the same device as the model.
                        input_ids = encoded["input_ids"].to(device)
                        attention_mask = encoded["attention_mask"].to(device)
                        with torch.inference_mode():
                            outputs = model(input_ids=input_ids, attention_mask=attention_mask).cpu().numpy()
                        scores = (np.exp(outputs).T / np.exp(outputs).sum(-1)).T
                        val = scipy.special.logit(scores[:, 1])  # use one vs rest logit units
                        return val
                    # build an explainer using a token masker
                    explainer = shap.Explainer(f, tokenizer)
                    shap_values = explainer([st.session_state["text"]], fixed_context=1)
                    html = shap.plots.text(shap_values, display=False)
                    st.markdown('<span style="color:blue"><strong>Blue:</strong> Contributes to "human" | </span><span style="color:red"><strong>Red:</strong> Contributes to "AI"</span>', unsafe_allow_html=True)
            # Render HTML
            st.components.v1.html(html, height=500, scrolling=True)
        else:
            st.error("Please enter some text and click 'Let's check!' before requesting an explanation.")

# Entry point: build and run the app when this file is executed as a script.
if __name__ == "__main__":
    main()