Montazer committed on
Commit
693378c
0 Parent(s):

Duplicate from ArefSadeghian/arabert-finetuned-caner

Browse files
Files changed (4) hide show
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. app.py +90 -0
  4. requirements.txt +3 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ArefSadeghian Arabert Finetuned Caner
3
+ emoji: 🦀
4
+ colorFrom: red
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 3.21.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: gpl
11
+ duplicated_from: ArefSadeghian/arabert-finetuned-caner
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ import re
4
+
5
# Right-to-left <div> template; the rendered NER markup is substituted for {}.
HTML_WRAPPER = """<div dir="rtl" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""

# Model id of the fine-tuned checkpoint. Change this string to point the app
# at a newer checkpoint.
model_checkpoint = "ArefSadeghian/arabert-finetuned-caner"

# Token-classification pipeline built once at import time and reused for
# every request. aggregation_strategy="simple" is requested so results come
# back grouped per entity rather than per token.
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
12
+
13
+ import re
14
+ import unicodedata
15
+
16
# Arabic marks that remove_diacritics() deletes. Only the keys matter; the
# None values are placeholders.
diacritics = {
    '\u064B': None,  # FATHATAN
    '\u064C': None,  # DAMMATAN
    '\u064D': None,  # KASRATAN
    '\u064E': None,  # FATHA
    '\u064F': None,  # DAMMA
    '\u0650': None,  # KASRA
    '\u0651': None,  # SHADDA
    '\u0652': None,  # SUKUN
}

# Translation table built once at import time instead of on every call:
# str.translate() deletes each code point that maps to None.
_DIACRITICS_TABLE = {ord(mark): None for mark in diacritics}


def remove_diacritics(text):
    """Return *text* in NFKD form with the marks in ``diacritics`` removed.

    Because the text is NFKD-normalized first, the result is the decomposed
    form of the input (compatibility characters are expanded), not merely
    the input minus the listed marks.
    """
    normalized_text = unicodedata.normalize('NFKD', text)
    return normalized_text.translate(_DIACRITICS_TABLE)
30
+
31
def remove_punctuation(text):
    """Return *text* without punctuation.

    Every character that is neither a regex word character (letters, digits,
    underscore) nor whitespace is deleted. Nothing is inserted in its place,
    so words separated only by punctuation end up joined.
    """
    return re.sub(r'[^\w\s]', '', text)
33
+
34
def preprocess_arabic_text(text):
    """Normalize raw input text before it is passed to the classifier.

    Steps, in order: strip diacritics, strip punctuation, collapse every run
    of whitespace to a single space, and lowercase. Leading and trailing
    whitespace is collapsed but not trimmed.
    """
    # Remove diacritics
    text = remove_diacritics(text)

    # Remove punctuation
    text = remove_punctuation(text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    # Convert to lowercase
    text = text.lower()

    return text
48
+
49
+
50
+ # Define a function to highlight different labels in the text
51
def highlight_text(text, entities):
    """Render *text* as HTML with every entity wrapped in a coloured ``<mark>``.

    Args:
        text: The string the entity offsets refer to.
        entities: Iterable of dicts providing ``start``, ``end``,
            ``entity_group``, ``score`` and ``word``. They are consumed in
            the order given and are expected to be sorted by ``start``.

    Returns:
        ``HTML_WRAPPER`` filled with the space-joined plain words and
        ``<mark>`` elements. Each mark shows the entity word, its label as a
        subscript and its score (two decimals) as a superscript.
    """
    # Function-scope import so this block does not depend on a file-level import.
    from html import escape

    entity_colors = {"Allah": "#ffe5cc", "Book": "#b3daff", "Clan": "#faedcb", "Crime": "#ffb3d9",
                     "Date": "#cce6ff", "Day": "#cce6ff", "Hell": "#d9d9d9", "Loc": "#d9b3ff",
                     "Meas": "#e6ccff", "Mon": "#ffd6cc", "Month": "#ffd6cc", "NatOb": "#ffe0b3",
                     "Number": "#ffe0cc", "Org": "#c1ffb3", "Para": "#f2f2f2", "Pers": "#b3ffb3",
                     "Prophet": "#e6ccff", "Rlig": "#ffff80", "Sect": "#b3d9ff", "Time": "#ffb3ba"}
    # Neutral colour for a label missing from the table, instead of a KeyError.
    fallback_color = "#e6e6e6"

    highlighted = []
    cursor = 0
    for entity in entities:
        start = int(entity['start'])
        end = int(entity['end'])

        # Plain words between the previous entity and this one.
        highlighted.extend(escape(word) for word in text[cursor:start].split())

        entity_group = entity['entity_group']
        score = entity['score']
        label = escape(str(entity_group))
        color = entity_colors.get(entity_group, fallback_color)
        word = escape(str(entity["word"]))
        marked_text = (
            f'<mark class="{label}" style="background-color: {color}">'
            f'{word}<sub>{label}</sub><sup>{score:.2f}</sup></mark>'
        )
        highlighted.append(marked_text)

        # Resume exactly at ``end`` (treated as exclusive). Skipping one more
        # character would drop a letter whenever an entity ends mid-word;
        # a following space is discarded by split() anyway.
        cursor = end

    highlighted.extend(escape(word) for word in text[cursor:].split())
    return HTML_WRAPPER.format(' '.join(highlighted))
68
+
69
+
70
# Callback wired into the Gradio interface: text in, HTML out.
def predict_ner(text):
    """Run NER on *text* and return the HTML shown to the user.

    The input is preprocessed, classified, and rendered with
    ``highlight_text``; the raw entity list is appended after a blank line.
    This is the UI boundary, so any exception is printed and its message is
    returned as the output instead of being raised.
    """
    try:
        text = preprocess_arabic_text(text)
        entities = token_classifier(text)
        highlighted_text = highlight_text(text, entities)
        return highlighted_text + '\n\n' + str(entities)
    except Exception as e:
        print(e)
        return str(e)
80
+
81
+
82
# Single-textbox UI: the submitted text goes to predict_ner and its return
# value is rendered as HTML. Components are taken from the top-level gradio
# namespace; the gr.inputs / gr.outputs aliases are deprecated.
iface = gr.Interface(
    fn=predict_ner,
    inputs=gr.Textbox(label="Enter Hadith in Arabic"),
    outputs=gr.HTML(label="Predicted Labels"),
    title="Hadith Analysis"
)

# Launch the interface
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ transformers
2
+ torch
3
+ tashaphyne