AliNajafi committed on
Commit d856fda
1 Parent(s): 2307b31

Add application files

.gitignore ADDED
@@ -0,0 +1,129 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
Preprocessor/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .demojize import demojize
+ from .preprocessor import preprocess
Preprocessor/demojize.py ADDED
@@ -0,0 +1,91 @@
+ import json
+ import os
+
+ # Resolve the emoji table relative to this file so imports work from any cwd.
+ _DIR = os.path.dirname(__file__)
+ EMOJI_DATA_PATH = os.path.join(_DIR, "emojis_tr_twitter.json")
+
+ with open(EMOJI_DATA_PATH, "r", encoding="utf-8") as f:
+     emojis = json.load(f)
+
+ _SEARCH_TREE = None
+
+
+ def _get_search_tree():
+     # Lazily build a character trie over all emoji sequences; each terminal
+     # node stores the emoji's metadata under the "data" key.
+     global _SEARCH_TREE
+     if _SEARCH_TREE is None:
+         _SEARCH_TREE = {}
+         for emj in emojis:
+             sub_tree = _SEARCH_TREE
+             lastidx = len(emj) - 1
+             for i, char in enumerate(emj):
+                 if char not in sub_tree:
+                     sub_tree[char] = {}
+                 sub_tree = sub_tree[char]
+                 if i == lastidx:
+                     sub_tree["data"] = emojis[emj]
+
+     return _SEARCH_TREE
+
+
+ def demojize(
+     string,
+     delimiters=("<emoji> ", " </emoji>"),
+     language="tr",
+     version=None,
+     handle_version=None,
+ ):
+     if language == "alias":
+         language = "tr"
+         _use_aliases = True
+     else:
+         _use_aliases = False
+     tree = _get_search_tree()
+     result = []
+     i = 0
+     length = len(string)
+     while i < length:
+         consumed = False
+         char = string[i]
+         if char in tree:
+             # Walk the trie to find the longest emoji sequence starting at i.
+             j = i + 1
+             sub_tree = tree[char]
+             while j < length and string[j] in sub_tree:
+                 sub_tree = sub_tree[string[j]]
+                 j += 1
+             if "data" in sub_tree:
+                 emj_data = sub_tree["data"]
+                 code_points = string[i:j]
+                 replace_str = None
+                 if version is not None and emj_data["E"] > version:
+                     if callable(handle_version):
+                         emj_data = emj_data.copy()
+                         emj_data["match_start"] = i
+                         emj_data["match_end"] = j
+                         replace_str = handle_version(code_points, emj_data)
+                     elif handle_version is not None:
+                         replace_str = str(handle_version)
+                     else:
+                         replace_str = None
+                 elif language in emj_data:
+                     if _use_aliases and "alias" in emj_data:
+                         replace_str = (
+                             delimiters[0] + emj_data["alias"][0][:-1] + delimiters[1]
+                         )
+                     else:
+                         replace_str = (
+                             delimiters[0] + emj_data[language][1:-1] + delimiters[1]
+                         )
+                 else:
+                     # The emoji exists, but it is not translated, so we keep the emoji
+                     replace_str = code_points
+
+                 i = j - 1
+                 consumed = True
+                 if replace_str:
+                     result.append(replace_str)
+
+         # Drop variation selectors (text/emoji presentation) when not consumed.
+         if not consumed and char != "\ufe0e" and char != "\ufe0f":
+             result.append(char)
+         i += 1
+
+     return "".join(result)
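
For reference, a minimal usage sketch of demojize. The emoji name in the comment is illustrative only; the real replacement text comes from the entries in emojis_tr_twitter.json:

    from Preprocessor.demojize import demojize

    # Hypothetical: assumes "😀" has a "tr" entry in emojis_tr_twitter.json.
    print(demojize("Harika bir gün 😀"))
    # Something like: 'Harika bir gün <emoji> sırıtan_yüz </emoji>'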
Preprocessor/emojis_tr_twitter.json ADDED
The diff for this file is too large to render. See raw diff
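The raw diff is omitted, but the way demojize.py consumes the file implies its shape: each key is an emoji string, and each value is a dict with at least an "E" field (the Unicode emoji version) plus per-language names such as "tr". A hypothetical entry, for illustration only:

    # Hypothetical entry shape inferred from how demojize() reads the file:
    example_entry = {
        "😀": {"E": 1.0, "tr": ":sırıtan_yüz:", "alias": [":grinning:"]},
    }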
 
Preprocessor/preprocessor.py ADDED
@@ -0,0 +1,75 @@
+ import urllib.parse
+ import html
+ import re
+
+ from urlextract import URLExtract
+ from unicodedata import normalize
+
+ from .demojize import demojize
+
+
+ def hashtag_handler(text: str):
+     pattern = r"(#([^\s]+))"
+     return re.sub(pattern, " <hashtag> \\2 </hashtag> ", text)
+
+
+ def cashtag_handler(text: str):
+     pattern = r"(\$([^\s]+))"
+     return re.sub(pattern, " <cashtag> \\2 </cashtag> ", text)
+
+
+ def mention_handler(text: str):
+     pattern = r"(@([^\s]+))"
+     return re.sub(pattern, " @user ", text)
+
+
+ url_extractor = URLExtract()
+
+
+ def url_handler(text: str):
+     # Replace each detected URL with just its domain, wrapped in <http> tags.
+     # URLs without a scheme get an https:// prefix so urlparse finds the netloc.
+     for url in url_extractor.gen_urls(text):
+         full_url = url if "http" in url else f"https://{url}"
+         domain = urllib.parse.urlparse(full_url).netloc
+         text = text.replace(url, f" <http> {domain} </http> ")
+     return text
+
+
+ def email_handler(text: str):
+     pattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
+     for m in re.findall(pattern, text):
+         text = text.replace(m, " <email> ").strip()
+     return text
+
+
+ def emoji_handler(text: str):
+     return demojize(text, language="tr", delimiters=(" <emoji> ", " </emoji> "))
+
+
+ def normalize_text(text: str):
+     return normalize("NFC", text)
+
+
+ def preprocess(text: str):
+     output = html.unescape(text)
+     output = normalize_text(output)
+     output = email_handler(output)
+     output = url_handler(output)
+     output = hashtag_handler(output)
+     output = cashtag_handler(output)
+     output = mention_handler(output)
+     output = emoji_handler(output)
+     output = re.sub(r"\s+", " ", output)
+     output = output.lower()
+     output = output.strip()
+
+     return output
+
+
+ if __name__ == "__main__":
+     sample_text = ""
+     preprocessed_text = preprocess(sample_text)
+     print(preprocessed_text)
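
A quick sanity check of the full pipeline on a tweet-like string. The sample input is made up, and emoji tags are omitted since their text depends on emojis_tr_twitter.json, so the output shown is approximate:

    from Preprocessor import preprocess

    sample = "Bakın https://example.com @birisi #Güzel $BTC mail@test.com"
    print(preprocess(sample))
    # Approximately: 'bakın <http> example.com </http> @user
    # <hashtag> güzel </hashtag> <cashtag> btc </cashtag> <email>'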
app.py ADDED
@@ -0,0 +1,45 @@
+ import streamlit as st
+ import torch
+ from peft import (
+     PeftModel,
+     PeftConfig,
+ )
+
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+ from Preprocessor import preprocess
+
+
+ peft_model = "VRLLab/TurkishBERTweet-Lora-SA"
+ peft_config = PeftConfig.from_pretrained(peft_model)
+
+ # Load the tokenizer of the PEFT config's base model.
+ padding_side = "right"
+ tokenizer = AutoTokenizer.from_pretrained(
+     peft_config.base_model_name_or_path, padding_side=padding_side
+ )
+ if getattr(tokenizer, "pad_token_id") is None:
+     tokenizer.pad_token_id = tokenizer.eos_token_id
+
+ # Load the base classifier, then attach the LoRA adapter weights.
+ id2label_sa = {0: "negative", 1: "neutral", 2: "positive"}
+ turkishBERTweet_sa = AutoModelForSequenceClassification.from_pretrained(
+     peft_config.base_model_name_or_path,
+     return_dict=True,
+     num_labels=len(id2label_sa),
+     id2label=id2label_sa,
+ )
+ turkishBERTweet_sa = PeftModel.from_pretrained(turkishBERTweet_sa, peft_model)
+
+
+ st.title("Sentiment Analysis with HuggingFace Spaces")
+ st.write("Enter a sentence to analyze its sentiment:")
+
+
+ user_input = st.text_input("")
+ if user_input:
+     with torch.no_grad():
+         ids = tokenizer.encode_plus(preprocess(user_input), return_tensors="pt")
+         logits = turkishBERTweet_sa(**ids).logits
+     label_id = logits.argmax(-1).item()
+     confidence = logits.softmax(-1)[0, label_id].item()
+     st.write(f"Sentiment: {id2label_sa[label_id]}")
+     st.write(f"Confidence: {confidence:.2f}")
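
With the dependencies installed (streamlit, torch, peft, transformers, and urlextract, inferred from the imports above), the Space can be exercised locally with `streamlit run app.py`.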