Huertas97 committed on
Commit
28f4a08
1 Parent(s): 2f9877e

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +142 -0
app.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import spacy
3
+ from streamlit_echarts import st_echarts
4
+ from annotated_text import annotated_text
5
+
6
# Page-wide Streamlit settings, gathered in one mapping and applied in a
# single call.
_PAGE_SETTINGS = {
    "page_title": "LeetSpeak-NER",
    "page_icon": ":mega:",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
    # Entries for the app menu; the keys are the menu item names.
    "menu_items": {
        'Get Help': 'https://www.extremelycoolapp.com/help',
        'Report a bug': "https://www.extremelycoolapp.com/bug",
        'About': "# This is a header. This is an *extremely* cool app!",
    },
}
st.set_page_config(**_PAGE_SETTINGS)
17
+
18
+
19
@st.cache(show_spinner=False, allow_output_mutation=True, suppress_st_warning=True)
def load_models():
    """Load the Spanish and English spaCy pipelines from disk.

    The function is wrapped in ``st.cache`` (spinner hidden, output mutation
    allowed, Streamlit warnings suppressed).

    Returns:
        dict: the loaded pipelines keyed by ``"English"`` and ``"Spanish"``.
    """
    # Listed in the order the pipelines are loaded: Spanish first.
    model_dirs = {
        "Spanish": "../spacy-models/toy_output_es_blank/model-best/",
        "English": "../spacy-models/toy_output_en_blank/model-best/",
    }
    loaded = {language: spacy.load(path) for language, path in model_dirs.items()}
    return {"English": loaded["English"], "Spanish": loaded["Spanish"]}
25
+
26
+
27
+ # 'INV_CAMO', 'LEETSPEAK', 'MIX', 'PUNCT_CAMO'
28
def process_text(doc, selected_multi_ner):
    """Turn a tokenised document into arguments for ``annotated_text``.

    Args:
        doc: iterable of tokens, each exposing ``text`` and ``ent_type_``.
        selected_multi_ner: ``"Yes"`` to label each camouflage entity with
            its own type and colour; any other value groups all four types
            under a single ``"CAMOUFLAGE"`` label.

    Returns:
        list: one item per token, in order. Tokens whose entity type is one
        of the four camouflage labels become ``(text, label, colour)``
        tuples; every other token becomes its text padded with one space on
        each side.
    """
    # Colour used for each entity type when the breakdown is requested.
    colour_by_label = {
        "INV_CAMO": "#faa",
        "LEETSPEAK": "#fda",
        "MIX": "#afa",
        "PUNCT_CAMO": "#aaaaff",
    }
    break_down = selected_multi_ner == "Yes"

    def render(token):
        label = token.ent_type_
        if label not in colour_by_label:
            # Not a camouflage entity: emit plain, space-padded text.
            return " " + token.text + " "
        if break_down:
            return (token.text, label, colour_by_label[label])
        return (token.text, "CAMOUFLAGE", "#ffd5aa")

    return [render(token) for token in doc]
50
+
51
+
52
# Sidebar controls: language of the text and whether entities are shown per
# camouflage type.
with st.sidebar:
    selected_language = st.selectbox("Select a language", options=["English", "Spanish"])
    selected_multi_ner = st.radio(
        'Do you want to break down the Entities detected by type of leetspeak?',
        ['Yes', 'No'],
    )

# Pick the pipeline matching the chosen language.
models = load_models()
selected_model = models[selected_language]
58
+
59
import base64

# Logo file embedded next to the page title; the path is relative to the
# working directory.
LOGO_IMAGE = "aida_logo.png"

# CSS for the logo wrapper and the logo image itself.
st.markdown(
    """
<style>
.container {
display: flex;

}
.logo-img {
float:right;
margin-top: 2.2em;
margin-left: -10em;
}
</style>
""",
    unsafe_allow_html=True
)


# Two-column header: wide title on the left, logo on the right.
col1, col2 = st.columns([4, 1])
with col1:
    st.markdown("""
<style>
.big-font {
font-size:3em;
font-weight: bold;
}
</style>
""", unsafe_allow_html=True)

    st.markdown('<p class="big-font">Welcome to <font color="#4B8BBE">Leet</font><font color="#FFE873">Speak</font><font color="#ff73a2">-NER</font></p>', unsafe_allow_html=True)
with col2:
    # Read the logo inside a context manager so the file handle is closed
    # right away instead of being left for the garbage collector.
    with open(LOGO_IMAGE, "rb") as logo_file:
        logo_b64 = base64.b64encode(logo_file.read()).decode()
    # The image is inlined as a base64 data URI inside raw HTML.
    st.markdown(
        f"""
<div class="container">
<img class="logo-img" src="data:image/png;base64,{logo_b64}">
</div>
""",
        unsafe_allow_html=True
    )
103
+
104
+
105
+
106
# Text shown inside the collapsible "Project Description" panel.
project_description = """
Developed in Applied Intelligence and Data Analysis ([AI+DA](http://aida.etsisi.upm.es/)) group at Polytech University of Madrid (UPM).
This tool uses a Spacy-Transformer Name Entity Recognition model to detect the presence of word camouflaged. Word camouflage is currently used to evade content moderation in Social Media. Therefore, the aim of this tool is to counter new ways of misinformation that emerge in social media platforms.

Currently, two languages are supported: English and Spanish. Additionally, you can select whether the detected entities are broken down into the three types of camouflaged words: Canonical Leetspeak, Punctuation Camouflaged, Inversion Camouflaged.
"""

# The panel starts collapsed.
with st.expander("Project Description", expanded=False):
    st.write(project_description)
113
+
114
+
115
+
116
+
117
+
118
st.subheader("Input Text")

# Input form: the text to analyse comes from the text area, or from an
# uploaded file when one is provided (the upload takes precedence).
with st.form("my_form"):
    text_input = st.text_area('Insert a text to detect leetspeak entities. Try for example: "@#plan#demia, pl@πd€m1∆ instead of “pandemia” (pandemic)"')

    uploaded_file = st.file_uploader("or Upload a file", type=["doc", "docx", "pdf", "txt"])
    if uploaded_file is not None:
        # The accepted extensions include binary formats (doc, docx, pdf)
        # whose bytes are not valid UTF-8. Guard the decode so such an upload
        # shows a message and falls back to the typed text, not a traceback.
        try:
            text_input = uploaded_file.getvalue().decode("utf-8")
        except UnicodeDecodeError:
            st.error(
                "The uploaded file could not be read as UTF-8 text; "
                "using the content of the text box instead."
            )

    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")


# Run the selected pipeline on the text and render the annotated tokens.
st.subheader("Output")
doc = selected_model(text_input)
tokens = process_text(doc, selected_multi_ner)

annotated_text(*tokens)
142
+